From a65ab3b154021db1c9e9aeedadbf56976cebb574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 13 Jul 2017 11:13:37 +0200 Subject: [PATCH] Fixes #56, bug in DNNClassifier for batch normalization --- 11_deep_learning.ipynb | 731 +++++++++++++++++++++++------------------ 1 file changed, 405 insertions(+), 326 deletions(-) diff --git a/11_deep_learning.ipynb b/11_deep_learning.ipynb index 999c4c2..4a80148 100644 --- a/11_deep_learning.ipynb +++ b/11_deep_learning.ipynb @@ -145,7 +145,9 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -159,7 +161,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "he_init = tf.contrib.layers.variance_scaling_initializer()\n", @@ -235,7 +239,9 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def leaky_relu(z, name=None):\n", @@ -282,7 +288,9 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "with tf.name_scope(\"dnn\"):\n", @@ -396,7 +404,9 @@ { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "def elu(z, alpha=1):\n", @@ -444,7 +454,9 @@ { "cell_type": "code", "execution_count": 23, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name=\"hidden1\")" @@ -655,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 30, "metadata": { "collapsed": true }, @@ -689,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 31, "metadata": { "collapsed": true }, @@ -710,8 +722,10 @@ }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, + "execution_count": 32, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from functools import partial\n", @@ -738,8 +752,10 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, + "execution_count": 33, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -794,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 34, "metadata": { "collapsed": true }, @@ -806,7 +822,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -863,7 +879,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -872,7 +888,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -895,7 +911,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 38, "metadata": { "collapsed": true }, @@ -929,7 +945,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 39, "metadata": { "collapsed": true }, @@ -947,8 +963,10 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, + "execution_count": 40, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "threshold = 1.0\n", @@ -969,7 +987,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 41, "metadata": { "collapsed": true }, @@ -982,7 +1000,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 42, "metadata": { "collapsed": true }, @@ -994,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 43, "metadata": { "collapsed": true }, @@ -1006,7 +1024,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1046,7 +1064,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 45, "metadata": { "collapsed": true }, @@ -1057,7 +1075,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 46, "metadata": { "collapsed": true }, @@ -1075,7 +1093,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1092,7 +1110,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 48, "metadata": { "collapsed": true }, @@ -1138,7 +1156,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 49, "metadata": { "scrolled": true }, @@ -1156,8 +1174,10 @@ }, { "cell_type": "code", - "execution_count": 44, - "metadata": {}, + "execution_count": 50, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "X = tf.get_default_graph().get_tensor_by_name(\"X:0\")\n", @@ -1177,8 +1197,10 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": {}, + "execution_count": 51, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "for op in (X, y, accuracy, training_op):\n", @@ -1194,8 +1216,10 @@ }, { "cell_type": "code", - "execution_count": 46, - "metadata": {}, + "execution_count": 52, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "X, y, accuracy, training_op = tf.get_collection(\"my_important_ops\")" @@ -1210,7 +1234,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -1228,7 +1252,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -1255,7 +1279,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 55, "metadata": { "collapsed": true }, @@ -1311,7 +1335,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1338,8 +1362,10 @@ }, { "cell_type": "code", - "execution_count": 51, - "metadata": {}, + "execution_count": 57, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -1382,7 +1408,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1410,7 +1436,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 59, "metadata": { "collapsed": true }, @@ -1457,7 +1483,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1500,7 +1526,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 61, "metadata": { "collapsed": true }, @@ -1514,7 +1540,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1556,7 +1582,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1602,7 +1628,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1618,7 +1644,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1627,7 +1653,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1643,8 +1669,10 @@ }, { "cell_type": "code", - "execution_count": 61, - "metadata": {}, + "execution_count": 67, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -1677,7 +1705,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 68, "metadata": { "collapsed": true }, @@ -1692,7 +1720,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 69, "metadata": { "collapsed": true }, @@ -1704,7 +1732,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1733,7 +1761,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 71, "metadata": { "collapsed": true }, @@ -1754,8 +1782,10 @@ }, { "cell_type": "code", - "execution_count": 66, - "metadata": {}, + "execution_count": 72, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "with tf.name_scope(\"dnn\"):\n", @@ -1773,7 +1803,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 73, "metadata": { "collapsed": true }, @@ -1801,7 +1831,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1837,7 +1867,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 75, "metadata": { "collapsed": true }, @@ -1882,7 +1912,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 76, "metadata": { "collapsed": true }, @@ -1899,7 +1929,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1944,7 +1974,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 78, "metadata": { "collapsed": true }, @@ -1963,7 +1993,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 79, "metadata": { "collapsed": true }, @@ -1982,7 +2012,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 80, "metadata": { "collapsed": true }, @@ -2000,7 +2030,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 81, "metadata": { "collapsed": true }, @@ -2019,7 +2049,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 82, "metadata": { "collapsed": true }, @@ -2037,7 +2067,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 83, "metadata": { "collapsed": true }, @@ -2069,7 +2099,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 84, "metadata": { "collapsed": true }, @@ -2088,8 +2118,10 @@ }, { "cell_type": "code", - "execution_count": 79, - "metadata": {}, + "execution_count": 85, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "init = tf.global_variables_initializer()\n", @@ -2098,7 +2130,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -2141,7 +2173,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 87, "metadata": { "collapsed": true }, @@ -2170,8 +2202,10 @@ }, { "cell_type": "code", - "execution_count": 82, - "metadata": {}, + "execution_count": 88, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "W1 = tf.get_default_graph().get_tensor_by_name(\"hidden1/kernel:0\")\n", @@ -2194,145 +2228,12 @@ "The rest is just as usual:" ] }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [], - "source": [ - "with tf.name_scope(\"eval\"):\n", - " correct = tf.nn.in_top_k(logits, y, 1)\n", - " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", - "\n", - "learning_rate = 0.01\n", - "\n", - "with tf.name_scope(\"train\"):\n", - " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", - " training_op = optimizer.minimize(loss)\n", - "\n", - "init = tf.global_variables_initializer()\n", - "saver = tf.train.Saver()" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "n_epochs = 20\n", - "batch_size = 200\n", - "\n", - "with tf.Session() as sess:\n", - " init.run()\n", - " for epoch in range(n_epochs):\n", - " for iteration in range(mnist.train.num_examples // batch_size):\n", - " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", - " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", - " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", - " y: mnist.test.labels})\n", - " print(epoch, \"Test accuracy:\", accuracy_val)\n", - "\n", - " save_path = saver.save(sess, \"./my_model_final.ckpt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alternatively, we can pass a regularization function to the `tf.layers.dense()` function, which will use it to create operations that will compute the regularization loss, and it adds these operations to the collection of regularization losses. The beginning is the same as above:" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "reset_graph()\n", - "\n", - "n_inputs = 28 * 28 # MNIST\n", - "n_hidden1 = 300\n", - "n_hidden2 = 50\n", - "n_outputs = 10\n", - "\n", - "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", - "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we will use Python's `partial()` function to avoid repeating the same arguments over and over again. Note that we set the `kernel_regularizer` argument:" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "scale = 0.001" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [], - "source": [ - "my_dense_layer = partial(\n", - " tf.layers.dense, activation=tf.nn.relu,\n", - " kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))\n", - "\n", - "with tf.name_scope(\"dnn\"):\n", - " hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n", - " hidden2 = my_dense_layer(hidden1, n_hidden2, name=\"hidden2\")\n", - " logits = my_dense_layer(hidden2, n_outputs, activation=None,\n", - " name=\"outputs\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next we must add the regularization losses to the base loss:" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "with tf.name_scope(\"loss\"): # not shown in the book\n", - " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits( # not shown\n", - " labels=y, logits=logits) # not shown\n", - " base_loss = tf.reduce_mean(xentropy, name=\"avg_xentropy\") # not shown\n", - " reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)\n", - " loss = tf.add_n([base_loss] + reg_losses, name=\"loss\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And the rest is the same as usual:" - ] - }, { "cell_type": "code", "execution_count": 89, - "metadata": {}, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "with tf.name_scope(\"eval\"):\n", @@ -2373,6 +2274,145 @@ " save_path = saver.save(sess, \"./my_model_final.ckpt\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, we can pass a regularization function to the `tf.layers.dense()` function, which will use it to create operations that will compute the regularization loss, and it adds these operations to the collection of regularization losses. The beginning is the same as above:" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "reset_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "n_hidden2 = 50\n", + "n_outputs = 10\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will use Python's `partial()` function to avoid repeating the same arguments over and over again. Note that we set the `kernel_regularizer` argument:" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "scale = 0.001" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "my_dense_layer = partial(\n", + " tf.layers.dense, activation=tf.nn.relu,\n", + " kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n", + " hidden2 = my_dense_layer(hidden1, n_hidden2, name=\"hidden2\")\n", + " logits = my_dense_layer(hidden2, n_outputs, activation=None,\n", + " name=\"outputs\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next we must add the regularization losses to the base loss:" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"loss\"): # not shown in the book\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits( # not shown\n", + " labels=y, logits=logits) # not shown\n", + " base_loss = tf.reduce_mean(xentropy, name=\"avg_xentropy\") # not shown\n", + " reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)\n", + " loss = tf.add_n([base_loss] + reg_losses, name=\"loss\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And the rest is the same as usual:" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", + "learning_rate = 0.01\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)\n", + "\n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "n_epochs = 20\n", + "batch_size = 200\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -2391,7 +2431,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 97, "metadata": { "collapsed": true }, @@ -2405,8 +2445,10 @@ }, { "cell_type": "code", - "execution_count": 92, - "metadata": {}, + "execution_count": 98, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "training = tf.placeholder_with_default(False, shape=(), name='training')\n", @@ -2426,8 +2468,10 @@ }, { "cell_type": "code", - "execution_count": 93, - "metadata": {}, + "execution_count": 99, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "with tf.name_scope(\"loss\"):\n", @@ -2448,7 +2492,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 100, "metadata": { "scrolled": true }, @@ -2485,8 +2529,10 @@ }, { "cell_type": "code", - "execution_count": 95, - "metadata": {}, + "execution_count": 101, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -2529,8 +2575,10 @@ }, { "cell_type": "code", - "execution_count": 96, - "metadata": {}, + "execution_count": 102, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "threshold = 1.0\n", @@ -2548,7 +2596,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 103, "metadata": { "collapsed": true }, @@ -2568,7 +2616,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 104, "metadata": { "collapsed": true }, @@ -2587,7 +2635,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 105, "metadata": { "collapsed": true }, @@ -2599,7 +2647,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -2627,7 +2675,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 107, "metadata": { "collapsed": true }, @@ -2652,8 +2700,10 @@ }, { "cell_type": "code", - "execution_count": 102, - "metadata": {}, + "execution_count": 108, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -2672,7 +2722,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 109, "metadata": { "collapsed": true }, @@ -2690,7 +2740,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 110, "metadata": { "collapsed": true }, @@ -2721,7 +2771,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 111, "metadata": { "collapsed": true }, @@ -2733,7 +2783,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 112, "metadata": { "scrolled": false }, @@ -2808,7 +2858,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 113, "metadata": { "collapsed": true }, @@ -2828,7 +2878,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 114, "metadata": { "collapsed": true }, @@ -2871,7 +2921,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 115, "metadata": { "collapsed": true }, @@ -2901,7 +2951,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -2918,7 +2968,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 117, "metadata": { "collapsed": true }, @@ -2934,7 +2984,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -3010,8 +3060,10 @@ }, { "cell_type": "code", - "execution_count": 113, - "metadata": {}, + "execution_count": 119, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, ClassifierMixin\n", @@ -3127,15 +3179,14 @@ " self._graph = tf.Graph()\n", " with self._graph.as_default():\n", " self._build_graph(n_inputs, n_outputs)\n", + " # extra ops for batch normalization\n", + " extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", "\n", " # needed in case of early stopping\n", " max_checks_without_progress = 20\n", " checks_without_progress = 0\n", " best_loss = np.infty\n", " best_params = None\n", - "\n", - " # extra ops for batch normalization\n", - " extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", " \n", " # Now train the model!\n", " self._session = tf.Session(graph=self._graph)\n", @@ -3201,7 +3252,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -3218,7 +3269,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -3237,7 +3288,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ @@ -3266,7 +3317,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -3275,7 +3326,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -3299,7 +3350,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 125, "metadata": { "collapsed": true }, @@ -3331,7 +3382,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -3356,7 +3407,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -3373,7 +3424,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -3387,12 +3438,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The best params are reached during epoch 2, that's much faster than earlier. Let's check the accuracy:" + "The best params are reached during epoch 48, that's actually a slower convergence than earlier. Let's check the accuracy:" ] }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -3404,12 +3455,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Well, batch normalization did not improve accuracy, quite the contrary. Let's see if we can find a good set of hyperparameters that will work well with batch normalization:" + "Well, batch normalization did not improve accuracy. Let's see if we can find a good set of hyperparameters that will work well with batch normalization:" ] }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -3434,7 +3485,7 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -3443,7 +3494,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -3455,7 +3506,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Oh well! Batch normalization did not help in this case. Let's see if dropout can do better." + "Slightly better than earlier: 99.4% vs 99.3%. Let's see if dropout can do better." ] }, { @@ -3476,12 +3527,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Since batch normalization did not help, let's go back to the best model we trained earlier and see how it performs on the training set:" + "Let's go back to the best model we trained earlier and see how it performs on the training set:" ] }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -3498,7 +3549,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -3524,7 +3575,7 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -3541,7 +3592,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -3566,7 +3617,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ @@ -3575,7 +3626,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -3587,14 +3638,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Oh well, neither batch normalization nor dropout improved the model. Better luck next time! :)" + "Oh well, dropout did not improve the model. Better luck next time! :)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "But that's okay, we have ourselves a nice DNN that achieves 99.32% accuracy on the test set. Now, let's see if some of its expertise on digits 0 to 4 can be transferred to the task of classifying digits 5 to 9." + "But that's okay, we have ourselves a nice DNN that achieves 99.40% accuracy on the test set using Batch Normalization, or 99.32% without BN. Let's see if some of this expertise on digits 0 to 4 can be transferred to the task of classifying digits 5 to 9. For the sake of simplicity we will reuse the DNN without BN, since it is almost as good." ] }, { @@ -3629,8 +3680,10 @@ }, { "cell_type": "code", - "execution_count": 133, - "metadata": {}, + "execution_count": 139, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -3654,8 +3707,10 @@ }, { "cell_type": "code", - "execution_count": 134, - "metadata": {}, + "execution_count": 140, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "learning_rate = 0.01\n", @@ -3667,7 +3722,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 141, "metadata": { "collapsed": true }, @@ -3703,7 +3758,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 142, "metadata": { "collapsed": true }, @@ -3726,7 +3781,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 143, "metadata": { "collapsed": true }, @@ -3745,8 +3800,10 @@ }, { "cell_type": "code", - "execution_count": 138, - "metadata": {}, + "execution_count": 144, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "X_train2, y_train2 = sample_n_instances_per_class(X_train2_full, y_train2_full, n=100)\n", @@ -3762,7 +3819,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ @@ -3840,7 +3897,7 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 146, "metadata": { "collapsed": true }, @@ -3858,7 +3915,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 147, "metadata": {}, "outputs": [], "source": [ @@ -3932,8 +3989,10 @@ }, { "cell_type": "code", - "execution_count": 142, - "metadata": {}, + "execution_count": 148, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -3963,8 +4022,10 @@ }, { "cell_type": "code", - "execution_count": 143, - "metadata": {}, + "execution_count": 149, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "learning_rate = 0.01\n", @@ -3986,7 +4047,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ @@ -4048,7 +4109,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 151, "metadata": { "collapsed": true }, @@ -4066,7 +4127,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 152, "metadata": {}, "outputs": [], "source": [ @@ -4114,7 +4175,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 153, "metadata": { "collapsed": true }, @@ -4131,7 +4192,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 154, "metadata": {}, "outputs": [], "source": [ @@ -4179,7 +4240,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -4189,7 +4250,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ @@ -4242,8 +4303,10 @@ }, { "cell_type": "code", - "execution_count": 151, - "metadata": {}, + "execution_count": 157, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "n_inputs = 28 * 28 # MNIST\n", @@ -4263,7 +4326,7 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 158, "metadata": { "collapsed": true }, @@ -4281,8 +4344,10 @@ }, { "cell_type": "code", - "execution_count": 153, - "metadata": {}, + "execution_count": 159, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "dnn1 = dnn(X1, name=\"DNN_A\")\n", @@ -4298,7 +4363,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 160, "metadata": { "collapsed": true }, @@ -4316,7 +4381,7 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 161, "metadata": {}, "outputs": [], "source": [ @@ -4325,7 +4390,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 162, "metadata": {}, "outputs": [], "source": [ @@ -4341,7 +4406,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ @@ -4357,7 +4422,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 164, "metadata": { "collapsed": true }, @@ -4377,7 +4442,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 165, "metadata": { "collapsed": true }, @@ -4395,8 +4460,10 @@ }, { "cell_type": "code", - "execution_count": 160, - "metadata": {}, + "execution_count": 166, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "y_as_float = tf.cast(y, tf.float32)\n", @@ -4413,8 +4480,10 @@ }, { "cell_type": "code", - "execution_count": 161, - "metadata": {}, + "execution_count": 167, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "learning_rate = 0.01\n", @@ -4433,8 +4502,10 @@ }, { "cell_type": "code", - "execution_count": 162, - "metadata": {}, + "execution_count": 168, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "y_pred_correct = tf.equal(y_pred, y)\n", @@ -4450,7 +4521,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 169, "metadata": { "collapsed": true }, @@ -4477,7 +4548,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 170, "metadata": { "collapsed": true }, @@ -4502,7 +4573,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 171, "metadata": { "collapsed": true }, @@ -4538,8 +4609,10 @@ }, { "cell_type": "code", - "execution_count": 166, - "metadata": {}, + "execution_count": 172, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "batch_size = 5\n", @@ -4555,7 +4628,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 173, "metadata": {}, "outputs": [], "source": [ @@ -4571,7 +4644,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 174, "metadata": {}, "outputs": [], "source": [ @@ -4594,7 +4667,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 175, "metadata": {}, "outputs": [], "source": [ @@ -4625,8 +4698,10 @@ }, { "cell_type": "code", - "execution_count": 170, - "metadata": {}, + "execution_count": 176, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "X_test1, y_test1 = generate_batch(X_test, y_test, batch_size=len(X_test))" @@ -4641,7 +4716,7 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": 177, "metadata": {}, "outputs": [], "source": [ @@ -4688,8 +4763,10 @@ }, { "cell_type": "code", - "execution_count": 172, - "metadata": {}, + "execution_count": 178, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -4731,7 +4808,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 179, "metadata": {}, "outputs": [], "source": [ @@ -4763,8 +4840,10 @@ }, { "cell_type": "code", - "execution_count": 174, - "metadata": {}, + "execution_count": 180, + "metadata": { + "collapsed": true + }, "outputs": [], "source": [ "reset_graph()\n", @@ -4798,7 +4877,7 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 181, "metadata": {}, "outputs": [], "source": [