From e6cd00f9a41b0cc86e4bc53f8cdea317be885262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Mon, 5 Jun 2017 18:48:03 +0200 Subject: [PATCH] Sync chapter 11 notebook with the code samples in that chapter --- 11_deep_learning.ipynb | 3016 ++++++++++++++++++++++++++++++++++------ 1 file changed, 2568 insertions(+), 448 deletions(-) diff --git a/11_deep_learning.ipynb b/11_deep_learning.ipynb index 74f0d00..90c320a 100644 --- a/11_deep_learning.ipynb +++ b/11_deep_learning.ipynb @@ -88,7 +88,7 @@ "editable": true }, "source": [ - "# Activation functions" + "# Vanishing/Exploding Gradients Problem" ] }, { @@ -134,6 +134,30 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Xavier and He Initialization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note: the book uses `tensorflow.contrib.layers.fully_connected()` rather than `tf.layers.dense()` (which did not exist when this chapter was written). It is now preferable to use `tf.layers.dense()`, because anything in the contrib module may change or be deleted without notice. The `dense()` function is almost identical to the `fully_connected()` function. The main differences relevant to this chapter are:\n", + "* several parameters are renamed: `scope` becomes `name`, `activation_fn` becomes `activation` (and similarly the `_fn` suffix is removed from other parameters such as `normalizer_fn`), `weights_initializer` becomes `kernel_initializer`, etc.\n", + "* the default `activation` is now `None` rather than `tf.nn.relu`.\n", + "* it does not support `tensorflow.contrib.framework.arg_scope()` (introduced later in chapter 11).\n", + "* it does not support regularizer params (introduced later in chapter 11)." 
+ ] + }, { "cell_type": "code", "execution_count": 4, @@ -143,6 +167,72 @@ "editable": true }, "outputs": [], + "source": [ + "import tensorflow as tf" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "he_init = tf.contrib.layers.variance_scaling_initializer()\n", + "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,\n", + " kernel_initializer=he_init, name=\"hidden1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Nonsaturating Activation Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Leaky ReLU" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], "source": [ "def leaky_relu(z, alpha=0.01):\n", " return np.maximum(alpha*z, z)" @@ -150,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "metadata": { "collapsed": false, "deletable": true, @@ -171,9 +261,231 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Implementing Leaky ReLU in TensorFlow:" + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "X = tf.placeholder(tf.float32, 
shape=(None, n_inputs), name=\"X\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def leaky_relu(z, name=None):\n", + " return tf.maximum(0.01 * z, z, name=name)\n", + "\n", + "hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name=\"hidden1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's train a neural network on MNIST using the Leaky ReLU. First let's create the graph:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "n_hidden2 = 100\n", + "n_outputs = 10" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name=\"hidden1\")\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=leaky_relu, name=\"hidden2\")\n", + " logits = tf.layers.dense(hidden2, n_outputs, name=\"outputs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = 
tf.reduce_mean(xentropy, name=\"loss\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "learning_rate = 0.01\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's load the data:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from tensorflow.examples.tutorials.mnist import input_data\n", + "mnist = input_data.read_data_sets(\"/tmp/data/\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "n_epochs = 40\n", + "batch_size = 50\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " if epoch % 5 == 0:\n", + " acc_train = 
accuracy.eval(feed_dict={X: X_batch, y: y_batch})\n", + " acc_test = accuracy.eval(feed_dict={X: mnist.validation.images, y: mnist.validation.labels})\n", + " print(epoch, \"Batch accuracy:\", acc_train, \"Validation accuracy:\", acc_test)\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### ELU" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": { "collapsed": false, "deletable": true, @@ -187,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 21, "metadata": { "collapsed": false, "deletable": true, @@ -208,9 +520,34 @@ "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Implementing ELU in TensorFlow is trivial, just specify the activation function when building each layer:" + ] + }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 22, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": { "collapsed": false, "deletable": true, @@ -218,13 +555,40 @@ }, "outputs": [], "source": [ - "from tensorflow.examples.tutorials.mnist import input_data\n", - "mnist = input_data.read_data_sets(\"/tmp/data/\")" + "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.elu, name=\"hidden1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Batch Normalization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note: the book uses `tensorflow.contrib.layers.batch_norm()` rather than `tf.layers.batch_normalization()` 
(which did not exist when this chapter was written). It is now preferable to use `tf.layers.batch_normalization()`, because anything in the contrib module may change or be deleted without notice. Instead of using the `batch_norm()` function as a normalizer parameter (`normalizer_fn`) to the `fully_connected()` function, we now use `batch_normalization()` and we explicitly create a distinct layer. The parameters are a bit different, in particular:\n", + "* `decay` is renamed to `momentum`,\n", + "* `is_training` is renamed to `training`,\n", + "* `updates_collections` is removed: the update operations needed by batch normalization are added to the `UPDATE_OPS` collection and you need to explicitly run these operations during training (see the execution phase below),\n", + "* we don't need to specify `scale=True`, as that is the default.\n", + "\n", + "Also note that in order to run batch norm just _before_ each hidden layer's activation function, we apply the ELU activation function manually, right after the batch norm layer.\n", + "\n", + "Note: since the `tf.layers.dense()` function is incompatible with `tf.contrib.layers.arg_scope()` (which is used in the book), we now use Python's `functools.partial()` function instead. It makes it easy to create a `my_dense_layer()` function that just calls `tf.layers.dense()` with the desired parameters automatically set (unless they are overridden when calling `my_dense_layer()`). As you can see, the code remains very similar." 
] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "metadata": { "collapsed": true, "deletable": true, @@ -232,13 +596,35 @@ }, "outputs": [], "source": [ - "def leaky_relu(z, name=None):\n", - " return tf.maximum(0.01 * z, z, name=name)" + "tf.reset_default_graph()\n", + "\n", + "import tensorflow as tf\n", + "\n", + "n_inputs = 28 * 28\n", + "n_hidden1 = 300\n", + "n_hidden2 = 100\n", + "n_outputs = 10\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "\n", + "training = tf.placeholder_with_default(False, shape=(), name='training')\n", + "\n", + "hidden1 = tf.layers.dense(X, n_hidden1, name=\"hidden1\")\n", + "bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)\n", + "bn1_act = tf.nn.elu(bn1)\n", + "\n", + "hidden2 = tf.layers.dense(bn1_act, n_hidden2, name=\"hidden2\")\n", + "bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)\n", + "bn2_act = tf.nn.elu(bn2)\n", + "\n", + "logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name=\"outputs\")\n", + "logits = tf.layers.batch_normalization(logits_before_bn, training=training,\n", + " momentum=0.9)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "metadata": { "collapsed": true, "deletable": true, @@ -246,12 +632,497 @@ }, "outputs": [], "source": [ - "import tensorflow as tf" + "tf.reset_default_graph()\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "training = tf.placeholder_with_default(False, shape=(), name='training')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "To avoid repeating the same parameters over and over again, we can use Python's `partial()` function:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 26, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ 
+ "from functools import partial\n", + "\n", + "my_batch_norm_layer = partial(tf.layers.batch_normalization,\n", + " training=training, momentum=0.9)\n", + "\n", + "hidden1 = tf.layers.dense(X, n_hidden1, name=\"hidden1\")\n", + "bn1 = my_batch_norm_layer(hidden1)\n", + "bn1_act = tf.nn.elu(bn1)\n", + "hidden2 = tf.layers.dense(bn1_act, n_hidden2, name=\"hidden2\")\n", + "bn2 = my_batch_norm_layer(hidden2)\n", + "bn2_act = tf.nn.elu(bn2)\n", + "logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name=\"outputs\")\n", + "logits = my_batch_norm_layer(logits_before_bn)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's build a neural net for MNIST, using the ELU activation function and Batch Normalization at each layer:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "batch_norm_momentum = 0.9\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", + "training = tf.placeholder_with_default(False, shape=(), name='training')\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " he_init = tf.contrib.layers.variance_scaling_initializer()\n", + "\n", + " my_batch_norm_layer = partial(\n", + " tf.layers.batch_normalization,\n", + " training=training,\n", + " momentum=batch_norm_momentum)\n", + "\n", + " my_dense_layer = partial(\n", + " tf.layers.dense,\n", + " kernel_initializer=he_init)\n", + "\n", + " hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n", + " bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))\n", + " hidden2 = my_dense_layer(bn1, n_hidden2, name=\"hidden2\")\n", + " bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))\n", + " logits_before_bn = my_dense_layer(bn2, n_outputs, name=\"outputs\")\n", + " logits = 
my_batch_norm_layer(logits_before_bn)\n", + "\n", + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)\n", + "\n", + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))\n", + " \n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note: since we are using `tf.layers.batch_normalization()` rather than `tf.contrib.layers.batch_norm()` (as in the book), we need to explicitly run the extra update operations needed by batch normalization (`sess.run([training_op, extra_update_ops],...`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "n_epochs = 20\n", + "batch_size = 200" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run([training_op, extra_update_ops],\n", + " feed_dict={training: True, X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "What!? That's not a great accuracy for MNIST. Of course, if you train for longer it will get much better accuracy, but with such a shallow network, Batch Norm and ELU are unlikely to have a very positive impact: they shine mostly for much deeper nets." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note that you could also make the training operation depend on the update operations:\n", + "\n", + "```python\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", + " with tf.control_dependencies(extra_update_ops):\n", + " training_op = optimizer.minimize(loss)\n", + "```\n", + "\n", + "This way, you would just have to evaluate the `training_op` during training, TensorFlow would automatically run the update operations as well:\n", + "\n", + "```python\n", + "sess.run(training_op, feed_dict={training: True, X: X_batch, y: y_batch})\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "One more thing: notice that the list of trainable variables is shorter than the list of all global variables. This is because the moving averages are non-trainable variables. If you want to reuse a pretrained neural network (see below), you must not forget these non-trainable variables." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "[v.name for v in tf.trainable_variables()]" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "[v.name for v in tf.global_variables()]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Gradient Clipping" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's create a simple neural net for MNIST and add gradient clipping. 
The first part is the same as earlier (except we added a few more layers to demonstrate reusing pretrained models, see below):" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "n_hidden2 = 50\n", + "n_hidden3 = 50\n", + "n_hidden4 = 50\n", + "n_hidden5 = 50\n", + "n_outputs = 10\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\")\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name=\"hidden2\")\n", + " hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name=\"hidden3\")\n", + " hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name=\"hidden4\")\n", + " hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name=\"hidden5\")\n", + " logits = tf.layers.dense(hidden5, n_outputs, name=\"outputs\")\n", + "\n", + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "learning_rate = 0.01" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now we apply gradient clipping. 
For this, we need to get the gradients, use the `clip_by_value()` function to clip them, then apply them:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "threshold = 1.0\n", + "\n", + "optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + "grads_and_vars = optimizer.compute_gradients(loss)\n", + "capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)\n", + " for grad, var in grads_and_vars]\n", + "training_op = optimizer.apply_gradients(capped_gvs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The rest is the same as usual:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "n_epochs = 20\n", + "batch_size = 200" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.Session() as sess:\n", + " init.run()\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, 
feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Reusing Pretrained Layers" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Reusing a TensorFlow Model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "First you need to load the graph's structure. The `import_meta_graph()` function does just that, loading the graph's operations into the default graph, and returning a `Saver` that you can then use to restore the model's state. Note that by default, a `Saver` saves the structure of the graph into a `.meta` file, so that's the file you should load:" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "saver = tf.train.import_meta_graph(\"./my_model_final.ckpt.meta\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Next you need to get a handle on all the operations you will need for training. 
If you don't know the graph's structure, you can list all the operations:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for op in tf.get_default_graph().get_operations():\n", + " print(op.name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Oops, that's a lot of operations! It's much easier to use TensorBoard to visualize the graph. The following hack will allow you to visualize the graph within Jupyter (if it does not work with your browser, you will need to use a `FileWriter` to save the graph and then visualize it in TensorBoard):" + ] + }, + { + "cell_type": "code", + "execution_count": 42, "metadata": { "collapsed": true, "deletable": true, @@ -297,6 +1168,20 @@ " display(HTML(iframe))" ] }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "show_graph(tf.get_default_graph())" + ] + }, { "cell_type": "markdown", "metadata": { @@ -304,16 +1189,12 @@ "editable": true }, "source": [ - "Note: the book uses `tensorflow.contrib.layers.fully_connected()` rather than `tf.layers.dense()` (which did not exist when this chapter was written). It is now preferable to use `tf.layers.dense()`, because anything in the contrib module may change or be deleted without notice. The `dense()` function is almost identical to the `fully_connected()` function. 
The main differences relevant to this chapter are:\n", - "* several parameters are renamed: `scope` becomes `name`, `activation_fn` becomes `activation` (and similarly the `_fn` suffix is removed from other parameters such as `normalizer_fn`), `weights_initializer` becomes `kernel_initializer`, etc.\n", - "* the default `activation` is now `None` rather than `tf.nn.relu`.\n", - "* it does not support `tensorflow.contrib.framework.arg_scope()` (introduced later in chapter 11).\n", - "* it does not support regularizer params (introduced later in chapter 11)." + "Once you know which operations you need, you can get a handle on them using the graph's `get_operation_by_name()` or `get_tensor_by_name()` methods:" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 44, "metadata": { "collapsed": false, "deletable": true, @@ -321,41 +1202,74 @@ }, "outputs": [], "source": [ - "tf.reset_default_graph()\n", + "X = tf.get_default_graph().get_tensor_by_name(\"X:0\")\n", + "y = tf.get_default_graph().get_tensor_by_name(\"y:0\")\n", "\n", - "n_inputs = 28*28 # MNIST\n", - "n_hidden1 = 300\n", - "n_hidden2 = 100\n", - "n_outputs = 10\n", - "learning_rate = 0.01\n", + "accuracy = tf.get_default_graph().get_tensor_by_name(\"eval/accuracy:0\")\n", "\n", - "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", - "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", - "\n", - "with tf.name_scope(\"dnn\"):\n", - " hidden1 = tf.layers.dense(X, n_hidden1, activation=leaky_relu, name=\"hidden1\")\n", - " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=leaky_relu, name=\"hidden2\")\n", - " logits = tf.layers.dense(hidden2, n_outputs, name=\"outputs\")\n", - "\n", - "with tf.name_scope(\"loss\"):\n", - " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", - " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", - "\n", - "with tf.name_scope(\"train\"):\n", - " optimizer = 
tf.train.GradientDescentOptimizer(learning_rate)\n", - " training_op = optimizer.minimize(loss)\n", - "\n", - "with tf.name_scope(\"eval\"):\n", - " correct = tf.nn.in_top_k(logits, y, 1)\n", - " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))\n", - " \n", - "init = tf.global_variables_initializer()\n", - "saver = tf.train.Saver()" + "training_op = tf.get_default_graph().get_operation_by_name(\"GradientDescent\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "If you are the author of the original model, you could make things easier for people who will reuse your model by giving operations very clear names and documenting them. Another approach is to create a collection containing all the important operations that people will want to get a handle on:" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 45, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for op in (X, y, accuracy, training_op):\n", + " tf.add_to_collection(\"my_important_ops\", op)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "This way people who reuse your model will be able to simply write:" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "X, y, accuracy, training_op = tf.get_collection(\"my_important_ops\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Now you can start a session, restore the model's state and continue training on your data:" + ] + }, + { + "cell_type": "code", + "execution_count": 47, "metadata": { "collapsed": false, "deletable": true, @@ -363,20 +1277,43 @@ }, "outputs": [], "source": [ - "n_epochs = 20\n", - "batch_size = 100\n", - "\n", 
"with tf.Session() as sess:\n", - " init.run()\n", + " saver.restore(sess, \"./my_model_final.ckpt\")\n", + " # continue training the model..." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Actually, let's test this for real!" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.Session() as sess:\n", + " saver.restore(sess, \"./my_model_final.ckpt\")\n", + "\n", " for epoch in range(n_epochs):\n", - " for iteration in range(len(mnist.test.labels)//batch_size):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", - " acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})\n", - " acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})\n", - " print(epoch, \"Train accuracy:\", acc_train, \"Test accuracy:\", acc_test)\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", "\n", - " save_path = saver.save(sess, \"my_model_final.ckpt\")" + " save_path = saver.save(sess, \"./my_new_model_final.ckpt\") " ] }, { @@ -386,137 +1323,12 @@ "editable": true }, "source": [ - "# Batch Normalization" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Note: the book uses `tensorflow.contrib.layers.batch_norm()` rather than `tf.layers.batch_normalization()` (which did not exist when this chapter was written). It is now preferable to use `tf.layers.batch_normalization()`, because anything in the contrib module may change or be deleted without notice. 
Instead of using the `batch_norm()` function as a regularizer parameter to the `fully_connected()` function, we now use `batch_normalization()` and we explicitly create a distinct layer. The parameters are a bit different, in particular:\n", - "* `decay` is renamed to `momentum`,\n", - "* `is_training` is renamed to `training`,\n", - "* `updates_collections` is removed: the update operations needed by batch normalization are added to the `UPDATE_OPS` collection and you need to explicity run these operations during training (see the execution phase below),\n", - "* we don't need to specify `scale=True`, as that is the default.\n", - "\n", - "Also note that in order to run batch norm just _before_ each hidden layer's activation function, we apply the ELU activation function manually, right after the batch norm layer.\n", - "\n", - "Note: since the `tf.layers.dense()` function is incompatible with `tf.contrib.layers.arg_scope()` (which is used in the book), we now use python's `functools.partial()` function instead. It makes it easy to create a `my_dense_layer()` function that just calls `tf.layers.dense()` with the desired parameters automatically set (unless they are overridden when calling `my_dense_layer()`). As you can see, the code remains very similar." 
+ "Alternatively, if you have access to the Python code that built the original graph, you can use it instead of `import_meta_graph()`:" ] }, { "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "tf.reset_default_graph()\n", - "\n", - "from functools import partial\n", - "\n", - "n_inputs = 28 * 28 # MNIST\n", - "n_hidden1 = 300\n", - "n_hidden2 = 100\n", - "n_outputs = 10\n", - "learning_rate = 0.01\n", - "momentum = 0.25\n", - "\n", - "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", - "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", - "is_training = tf.placeholder(tf.bool, shape=(), name='is_training')\n", - "\n", - "with tf.name_scope(\"dnn\"):\n", - " he_init = tf.contrib.layers.variance_scaling_initializer()\n", - "\n", - " my_batch_norm_layer = partial(\n", - " tf.layers.batch_normalization,\n", - " training=is_training,\n", - " momentum=0.9)\n", - "\n", - " my_dense_layer = partial(\n", - " tf.layers.dense,\n", - " kernel_initializer=he_init)\n", - "\n", - " hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n", - " bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))\n", - " hidden2 = my_dense_layer(bn1, n_hidden2, name=\"hidden2\")\n", - " bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))\n", - " logits_before_bn = my_dense_layer(bn2, n_outputs, activation=None, name=\"outputs\")\n", - " logits = my_batch_norm_layer(logits_before_bn)\n", - " extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", - "\n", - "with tf.name_scope(\"loss\"):\n", - " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", - " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", - "\n", - "with tf.name_scope(\"train\"):\n", - " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)\n", - " training_op = optimizer.minimize(loss)\n", - "\n", - "with tf.name_scope(\"eval\"):\n", - " correct = 
tf.nn.in_top_k(logits, y, 1)\n", - " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))\n", - " \n", - "init = tf.global_variables_initializer()\n", - "saver = tf.train.Saver()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Note: since we are using `tf.layers.batch_normalization()` rather than `tf.contrib.layers.batch_norm()` (as in the book), we need to explicitly run the extra update operations needed by batch normalization (`sess.run([training_op, extra_update_ops],...`)." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "n_epochs = 20\n", - "batch_size = 200\n", - "\n", - "with tf.Session() as sess:\n", - " init.run()\n", - " for epoch in range(n_epochs):\n", - " for iteration in range(len(mnist.test.labels)//batch_size):\n", - " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", - " sess.run([training_op, extra_update_ops], feed_dict={is_training: True, X: X_batch, y: y_batch})\n", - " acc_train = accuracy.eval(feed_dict={is_training: False, X: X_batch, y: y_batch})\n", - " acc_test = accuracy.eval(feed_dict={is_training: False, X: mnist.test.images, y: mnist.test.labels})\n", - " print(epoch, \"Train accuracy:\", acc_train, \"Test accuracy:\", acc_test)\n", - "\n", - " save_path = saver.save(sess, \"my_model_final.ckpt\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "Now the same model with $\\ell_1$ regularization:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, + "execution_count": 49, "metadata": { "collapsed": true, "deletable": true, @@ -526,90 +1338,45 @@ "source": [ "tf.reset_default_graph()\n", "\n", - "from functools import partial\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "n_hidden2 = 50\n", + "n_hidden3 = 50\n", + "n_hidden4 = 50\n", 
+ "n_outputs = 10\n", "\n", "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", - "is_training = tf.placeholder(tf.bool, shape=(), name='is_training')\n", "\n", "with tf.name_scope(\"dnn\"):\n", - " he_init = tf.contrib.layers.variance_scaling_initializer()\n", - "\n", - " my_batch_norm_layer = partial(\n", - " tf.layers.batch_normalization,\n", - " training=is_training,\n", - " momentum=0.9)\n", - "\n", - " my_dense_layer = partial(\n", - " tf.layers.dense,\n", - " kernel_initializer=he_init,\n", - " kernel_regularizer=tf.contrib.layers.l1_regularizer(0.01))\n", - "\n", - " hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n", - " bn1 = tf.nn.elu(my_batch_norm_layer(hidden1))\n", - " hidden2 = my_dense_layer(bn1, n_hidden2, name=\"hidden2\")\n", - " bn2 = tf.nn.elu(my_batch_norm_layer(hidden2))\n", - " logits_before_bn = my_dense_layer(bn2, n_outputs, activation=None, name=\"outputs\")\n", - " logits = my_batch_norm_layer(logits_before_bn)\n", - " extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\")\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name=\"hidden2\")\n", + " hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name=\"hidden3\")\n", + " hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name=\"hidden4\")\n", + " hidden5 = tf.layers.dense(hidden4, n_hidden5, activation=tf.nn.relu, name=\"hidden5\")\n", + " logits = tf.layers.dense(hidden5, n_outputs, name=\"outputs\")\n", "\n", "with tf.name_scope(\"loss\"):\n", " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", - " reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)\n", - " base_loss = tf.reduce_mean(xentropy, name=\"base_loss\")\n", - " loss = tf.add_n([base_loss] + reg_losses, name=\"loss\")\n", - "\n", 
- "with tf.name_scope(\"train\"):\n", - " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)\n", - " training_op = optimizer.minimize(loss)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", "\n", "with tf.name_scope(\"eval\"):\n", " correct = tf.nn.in_top_k(logits, y, 1)\n", - " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", + "learning_rate = 0.01\n", + "threshold = 1.0\n", + "\n", + "optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + "grads_and_vars = optimizer.compute_gradients(loss)\n", + "capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)\n", + " for grad, var in grads_and_vars]\n", + "training_op = optimizer.apply_gradients(capped_gvs)\n", "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()" ] }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "n_epochs = 20\n", - "batch_size = 200\n", - "\n", - "with tf.Session() as sess:\n", - " init.run()\n", - " for epoch in range(n_epochs):\n", - " for iteration in range(len(mnist.test.labels)//batch_size):\n", - " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", - " sess.run([training_op, extra_update_ops], feed_dict={is_training: True, X: X_batch, y: y_batch})\n", - " acc_train = accuracy.eval(feed_dict={is_training: False, X: X_batch, y: y_batch})\n", - " acc_test = accuracy.eval(feed_dict={is_training: False, X: mnist.test.images, y: mnist.test.labels})\n", - " print(epoch, \"Train accuracy:\", acc_train, \"Test accuracy:\", acc_test)\n", - "\n", - " save_path = saver.save(sess, \"my_model_final.ckpt\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "[v.name for v in 
tf.global_variables()]" - ] - }, { "cell_type": "markdown", "metadata": { @@ -617,12 +1384,12 @@ "editable": true }, "source": [ - "Note: the weights variable created by the `tf.layers.dense()` function is called `\"kernel\"` (instead of `\"weights\"` when using the `tf.contrib.layers.fully_connected()`, as in the book):" + "And continue training:" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 50, "metadata": { "collapsed": false, "deletable": true, @@ -630,119 +1397,33 @@ }, "outputs": [], "source": [ - "with tf.variable_scope(\"\", default_name=\"\", reuse=True): # root scope\n", - " weights1 = tf.get_variable(\"hidden1/kernel\")\n", - " weights2 = tf.get_variable(\"hidden2/kernel\")\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "tf.reset_default_graph()\n", - "\n", - "x = tf.constant([0., 0., 3., 4., 30., 40., 300., 400.], shape=(4, 2))\n", - "c = tf.clip_by_norm(x, clip_norm=10)\n", - "c0 = tf.clip_by_norm(x, clip_norm=350, axes=0)\n", - "c1 = tf.clip_by_norm(x, clip_norm=10, axes=1)\n", - "\n", "with tf.Session() as sess:\n", - " xv = x.eval()\n", - " cv = c.eval()\n", - " c0v = c0.eval()\n", - " c1v = c1.eval()\n", + " saver.restore(sess, \"./my_model_final.ckpt\")\n", "\n", - "print(xv)" + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_new_model_final.ckpt\") " ] }, { - "cell_type": "code", - "execution_count": 21, + "cell_type": "markdown", "metadata": { - "collapsed": false, "deletable": true, "editable": 
true }, - "outputs": [], "source": [ - "print(cv)" + "In general you will want to reuse only the lower layers. If you are using `import_meta_graph()` it will load the whole graph, but you can simply ignore the parts you do not need. In this example, we add a new 4th hidden layer on top of the pretrained 3rd layer (ignoring the old 4th hidden layer). We also build a new output layer, the loss for this new output, and a new optimizer to minimize it. We also need another saver to save the whole graph (containing both the entire old graph plus the new operations), and an initialization operation to initialize all the new variables:" ] }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "print(np.linalg.norm(cv))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "print(c0v)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "print(np.linalg.norm(c0v, axis=0))" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "print(c1v)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "print(np.linalg.norm(c1v, axis=1))" - ] - }, - { - "cell_type": "code", - "execution_count": 27, + "execution_count": 51, "metadata": { "collapsed": false, "deletable": true, @@ -752,55 +1433,658 @@ "source": [ "tf.reset_default_graph()\n", "\n", - "from functools import partial\n", + "n_hidden4 = 20 # new layer\n", + "n_outputs = 10 # new layer\n", + "\n", + "saver = 
tf.train.import_meta_graph(\"./my_model_final.ckpt.meta\")\n", + "\n", + "X = tf.get_default_graph().get_tensor_by_name(\"X:0\")\n", + "y = tf.get_default_graph().get_tensor_by_name(\"y:0\")\n", + "\n", + "hidden3 = tf.get_default_graph().get_tensor_by_name(\"dnn/hidden3/Relu:0\")\n", + "\n", + "new_hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name=\"new_hidden4\")\n", + "new_logits = tf.layers.dense(new_hidden4, n_outputs, name=\"new_outputs\")\n", + "\n", + "with tf.name_scope(\"new_loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=new_logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", + "\n", + "with tf.name_scope(\"new_eval\"):\n", + " correct = tf.nn.in_top_k(new_logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", + "with tf.name_scope(\"new_train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)\n", + "\n", + "init = tf.global_variables_initializer()\n", + "new_saver = tf.train.Saver()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "And we can train this new model:" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.Session() as sess:\n", + " init.run()\n", + " saver.restore(sess, \"./my_model_final.ckpt\")\n", + "\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = new_saver.save(sess,
\"./my_new_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "If you have access to the Python code that built the original graph, you can just reuse the parts you need and drop the rest:" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300 # reused\n", + "n_hidden2 = 50 # reused\n", + "n_hidden3 = 50 # reused\n", + "n_hidden4 = 20 # new!\n", + "n_outputs = 10 # new!\n", "\n", "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", - "is_training = tf.placeholder(tf.bool, shape=(), name='is_training')\n", - "\n", - "def max_norm_regularizer(threshold, axes=1, name=\"max_norm\", collection=\"max_norm\"):\n", - " def max_norm(weights):\n", - " clip_weights = tf.assign(weights, tf.clip_by_norm(weights, clip_norm=threshold, axes=axes), name=name)\n", - " tf.add_to_collection(collection, clip_weights)\n", - " return None # there is no regularization loss term\n", - " return max_norm\n", "\n", "with tf.name_scope(\"dnn\"):\n", - " \n", - " my_dense_layer = partial(\n", - " tf.layers.dense,\n", - " activation=tf.nn.relu,\n", - " kernel_regularizer=max_norm_regularizer(1.5))\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\") # reused\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name=\"hidden2\") # reused\n", + " hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name=\"hidden3\") # reused\n", + " hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name=\"hidden4\") # new!\n", + " logits = tf.layers.dense(hidden4, n_outputs, name=\"outputs\") # new!\n", "\n", - " hidden1 = my_dense_layer(X, 
n_hidden1, name=\"hidden1\")\n", - " hidden2 = my_dense_layer(hidden1, n_hidden2, name=\"hidden2\")\n", - " logits = my_dense_layer(hidden2, n_outputs, activation=None, name=\"outputs\")\n", - "\n", - "clip_all_weights = tf.get_collection(\"max_norm\")\n", - " \n", "with tf.name_scope(\"loss\"):\n", " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", "\n", + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", "with tf.name_scope(\"train\"):\n", - " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)\n", - " threshold = 1.0\n", - " grads_and_vars = optimizer.compute_gradients(loss)\n", - " capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)\n", - " for grad, var in grads_and_vars]\n", - " training_op = optimizer.apply_gradients(capped_gvs)\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "However, you must create one `Saver` to restore the pretrained model (giving it the list of variables to restore, or else it will complain that the graphs don't match), and another `Saver` to save the new model, once it is trained:" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", + " scope=\"hidden[123]\") # regular expression\n", + "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", + "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "\n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()\n", + "\n", 
+ "with tf.Session() as sess:\n", + " init.run()\n", + " restore_saver.restore(sess, \"./my_model_final.ckpt\")\n", + "\n", + " for epoch in range(n_epochs): # not shown in the book\n", + " for iteration in range(mnist.train.num_examples // batch_size): # not shown\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size) # not shown\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch}) # not shown\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images, # not shown\n", + " y: mnist.test.labels}) # not shown\n", + " print(epoch, \"Test accuracy:\", accuracy_val) # not shown\n", + "\n", + " save_path = saver.save(sess, \"./my_new_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Reusing Models from Other Frameworks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In this example, for each variable we want to reuse, we find its initializer's assignment operation, and we get its second input, which corresponds to the initialization value. When we run the initializer, we replace the initialization values with the ones we want, using a `feed_dict`:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 2\n", + "n_hidden1 = 3" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "original_w = [[1., 2., 3.], [4., 5., 6.]] # Load the weights from the other framework\n", + "original_b = [7., 8., 9.] 
# Load the biases from the other framework\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\")\n", + "# [...] Build the rest of the model\n", + "\n", + "# Get a handle on the assignment nodes for the hidden1 variables\n", + "graph = tf.get_default_graph()\n", + "assign_kernel = graph.get_operation_by_name(\"hidden1/kernel/Assign\")\n", + "assign_bias = graph.get_operation_by_name(\"hidden1/bias/Assign\")\n", + "init_kernel = assign_kernel.inputs[1]\n", + "init_bias = assign_bias.inputs[1]\n", + "\n", + "init = tf.global_variables_initializer()\n", + "\n", + "with tf.Session() as sess:\n", + " sess.run(init, feed_dict={init_kernel: original_w, init_bias: original_b})\n", + " # [...] Train the model on your new task\n", + " print(hidden1.eval(feed_dict={X: [[10.0, 11.0]]})) # not shown in the book" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note: the weights variable created by the `tf.layers.dense()` function is called `\"kernel\"` (instead of `\"weights\"` when using the `tf.contrib.layers.fully_connected()`, as in the book), and the biases variable is called `bias` instead of `biases`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Another approach (initially used in the book) would be to create dedicated assignment nodes and dedicated placeholders. 
This is more verbose and less efficient, but you may find this more explicit:" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 2\n", + "n_hidden1 = 3\n", + "\n", + "original_w = [[1., 2., 3.], [4., 5., 6.]] # Load the weights from the other framework\n", + "original_b = [7., 8., 9.] # Load the biases from the other framework\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\")\n", + "# [...] Build the rest of the model\n", + "\n", + "# Get a handle on the variables of layer hidden1\n", + "with tf.variable_scope(\"\", default_name=\"\", reuse=True): # root scope\n", + " hidden1_weights = tf.get_variable(\"hidden1/kernel\")\n", + " hidden1_biases = tf.get_variable(\"hidden1/bias\")\n", + "\n", + "# Create dedicated placeholders and assignment nodes\n", + "original_weights = tf.placeholder(tf.float32, shape=(n_inputs, n_hidden1))\n", + "original_biases = tf.placeholder(tf.float32, shape=n_hidden1)\n", + "assign_hidden1_weights = tf.assign(hidden1_weights, original_weights)\n", + "assign_hidden1_biases = tf.assign(hidden1_biases, original_biases)\n", + "\n", + "init = tf.global_variables_initializer()\n", + "\n", + "with tf.Session() as sess:\n", + " sess.run(init)\n", + " sess.run(assign_hidden1_weights, feed_dict={original_weights: original_w})\n", + " sess.run(assign_hidden1_biases, feed_dict={original_biases: original_b})\n", + " # [...] 
Train the model on your new task\n", + " print(hidden1.eval(feed_dict={X: [[10.0, 11.0]]}))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Note that we could also get a handle on the variables using `get_collection()` and specifying the `scope`:" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=\"hidden1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Or we could use the graph's `get_tensor_by_name()` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.get_default_graph().get_tensor_by_name(\"hidden1/kernel:0\")" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.get_default_graph().get_tensor_by_name(\"hidden1/bias:0\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Freezing the Lower Layers" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300 # reused\n", + "n_hidden2 = 50 # reused\n", + "n_hidden3 = 50 # reused\n", + "n_hidden4 = 20 # new!\n", + "n_outputs = 10 # new!\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, 
n_hidden1, activation=tf.nn.relu, name=\"hidden1\") # reused\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name=\"hidden2\") # reused\n", + " hidden3 = tf.layers.dense(hidden2, n_hidden3, activation=tf.nn.relu, name=\"hidden3\") # reused\n", + " hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu, name=\"hidden4\") # new!\n", + " logits = tf.layers.dense(hidden4, n_outputs, name=\"outputs\") # new!\n", + "\n", + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", "\n", "with tf.name_scope(\"eval\"):\n", " correct = tf.nn.in_top_k(logits, y, 1)\n", - " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))\n", - " \n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"train\"): # not shown in the book\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate) # not shown\n", + " train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,\n", + " scope=\"hidden[34]|outputs\")\n", + " training_op = optimizer.minimize(loss, var_list=train_vars)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "init = tf.global_variables_initializer()\n", + "new_saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", + " scope=\"hidden[123]\") # regular expression\n", + "reuse_vars_dict = dict([(var.op.name, var) for var in 
reuse_vars])\n", + "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "\n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " restore_saver.restore(sess, \"./my_model_final.ckpt\")\n", + "\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_new_model_final.ckpt\")" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300 # reused\n", + "n_hidden2 = 50 # reused\n", + "n_hidden3 = 50 # reused\n", + "n_hidden4 = 20 # new!\n", + "n_outputs = 10 # new!\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,\n", + " name=\"hidden1\") # reused frozen\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,\n", + " name=\"hidden2\") # reused frozen\n", + " hidden2_stop = tf.stop_gradient(hidden2)\n", + " hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu,\n", + " name=\"hidden3\") # reused, not frozen\n", + " hidden4 = tf.layers.dense(hidden3, n_hidden4, 
activation=tf.nn.relu,\n", + " name=\"hidden4\") # new!\n", + " logits = tf.layers.dense(hidden4, n_outputs, name=\"outputs\") # new!" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", + "\n", + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The training code is exactly the same as earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", + " scope=\"hidden[123]\") # regular expression\n", + "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", + "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "\n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " restore_saver.restore(sess, \"./my_model_final.ckpt\")\n", + "\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: 
mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_new_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Caching the Frozen Layers" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300 # reused\n", + "n_hidden2 = 50 # reused\n", + "n_hidden3 = 50 # reused\n", + "n_hidden4 = 20 # new!\n", + "n_outputs = 10 # new!\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu,\n", + " name=\"hidden1\") # reused frozen\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,\n", + " name=\"hidden2\") # reused frozen & cached\n", + " hidden2_stop = tf.stop_gradient(hidden2)\n", + " hidden3 = tf.layers.dense(hidden2_stop, n_hidden3, activation=tf.nn.relu,\n", + " name=\"hidden3\") # reused, not frozen\n", + " hidden4 = tf.layers.dense(hidden3, n_hidden4, activation=tf.nn.relu,\n", + " name=\"hidden4\") # new!\n", + " logits = tf.layers.dense(hidden4, n_outputs, name=\"outputs\") # new!\n", + "\n", + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", + "\n", + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = 
optimizer.minimize(loss)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "reuse_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,\n", + " scope=\"hidden[123]\") # regular expression\n", + "reuse_vars_dict = dict([(var.op.name, var) for var in reuse_vars])\n", + "restore_saver = tf.train.Saver(reuse_vars_dict) # to restore layers 1-3\n", + "\n", "init = tf.global_variables_initializer()\n", "saver = tf.train.Saver()" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 71, "metadata": { "collapsed": false, "deletable": true, @@ -808,26 +2092,331 @@ }, "outputs": [], "source": [ - "n_epochs = 20\n", + "import numpy as np\n", + "\n", + "n_batches = mnist.train.num_examples // batch_size\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " restore_saver.restore(sess, \"./my_model_final.ckpt\")\n", + " \n", + " h2_cache = sess.run(hidden2, feed_dict={X: mnist.train.images})\n", + " h2_cache_test = sess.run(hidden2, feed_dict={X: mnist.test.images}) # not shown in the book\n", + "\n", + " for epoch in range(n_epochs):\n", + " shuffled_idx = rnd.permutation(mnist.train.num_examples)\n", + " hidden2_batches = np.array_split(h2_cache[shuffled_idx], n_batches)\n", + " y_batches = np.array_split(mnist.train.labels[shuffled_idx], n_batches)\n", + " for hidden2_batch, y_batch in zip(hidden2_batches, y_batches):\n", + " sess.run(training_op, feed_dict={hidden2:hidden2_batch, y:y_batch})\n", + "\n", + " accuracy_val = accuracy.eval(feed_dict={hidden2: h2_cache_test, # not shown\n", + " y: mnist.test.labels}) # not shown\n", + " print(epoch, \"Test accuracy:\", accuracy_val) # not shown\n", + "\n", + " save_path = saver.save(sess, \"./my_new_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Faster Optimizers" + ] + }, + { 
+ "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Momentum optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,\n", + " momentum=0.9)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Nesterov Accelerated Gradient" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,\n", + " momentum=0.9, use_nesterov=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## AdaGrad" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## RMSProp" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,\n", + " momentum=0.9, decay=0.9, epsilon=1e-10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Adam Optimization" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "optimizer = 
tf.train.AdamOptimizer(learning_rate=learning_rate)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Learning Rate Scheduling" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "n_hidden2 = 50\n", + "n_outputs = 10\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\")\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name=\"hidden2\")\n", + " logits = tf.layers.dense(hidden2, n_outputs, name=\"outputs\")\n", + "\n", + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", + "\n", + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"train\"): # not shown in the book\n", + " initial_learning_rate = 0.1\n", + " decay_steps = 10000\n", + " decay_rate = 1/10\n", + " global_step = tf.Variable(0, trainable=False, name=\"global_step\")\n", + " learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step,\n", + " decay_steps, decay_rate)\n", + " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)\n", + " training_op = optimizer.minimize(loss, global_step=global_step)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 79, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "n_epochs = 5\n", "batch_size = 50\n", "\n", "with tf.Session() as sess:\n", " init.run()\n", " for epoch in range(n_epochs):\n", - " for iteration in range(len(mnist.test.labels)//batch_size):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", - " sess.run(training_op, feed_dict={is_training: True, X: X_batch, y: y_batch})\n", - " sess.run(clip_all_weights)\n", - " acc_train = accuracy.eval(feed_dict={is_training: False, X: X_batch, y: y_batch})\n", - " acc_test = accuracy.eval(feed_dict={is_training: False, X: mnist.test.images, y: mnist.test.labels})\n", - " print(epoch, \"Train accuracy:\", acc_train, \"Test accuracy:\", acc_test)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", "\n", - " save_path = saver.save(sess, \"my_model_final.ckpt\")" + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Avoiding Overfitting Through Regularization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## $\\ell_1$ and $\\ell_2$ regularization" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's implement $\\ell_1$ regularization manually. 
First, we create the model, as usual (with just one hidden layer this time, for simplicity):" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 81, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "n_outputs = 10\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\")\n", + " logits = tf.layers.dense(hidden1, n_outputs, name=\"outputs\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Next, we get a handle on the layer weights, and we compute the total loss, which is equal to the sum of the usual cross entropy loss and the $\\ell_1$ loss (i.e., the sum of the absolute values of the weights, multiplied by the regularization hyperparameter):" + ] + }, + { + "cell_type": "code", + "execution_count": 82, "metadata": { "collapsed": false, "deletable": true, @@ -835,7 +2424,252 @@ }, "outputs": [], "source": [ - "show_graph(tf.get_default_graph())" + "W1 = tf.get_default_graph().get_tensor_by_name(\"hidden1/kernel:0\")\n", + "W2 = tf.get_default_graph().get_tensor_by_name(\"outputs/kernel:0\")\n", + "\n", + "scale = 0.001 # l1 regularization hyperparameter\n", + "\n", + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,\n", + " logits=logits)\n", + " base_loss = tf.reduce_mean(xentropy, name=\"avg_xentropy\")\n", + " reg_losses = tf.reduce_sum(tf.abs(W1)) + tf.reduce_sum(tf.abs(W2))\n", + " loss = tf.add(base_loss, scale * reg_losses, name=\"loss\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The rest is just as usual:" + ] + }, +
{ + "cell_type": "code", + "execution_count": 83, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", + "learning_rate = 0.01\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)\n", + "\n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "n_epochs = 20\n", + "batch_size = 200\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Alternatively, we can pass a regularization function to the `tf.layers.dense()` function, which will use it to create operations that will compute the regularization loss, and it adds these operations to the collection of regularization losses. 
The beginning is the same as above:" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28 # MNIST\n", + "n_hidden1 = 300\n", + "n_hidden2 = 50\n", + "n_outputs = 10\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Next, we will use Python's `partial()` function to avoid repeating the same arguments over and over again. Note that we set the `kernel_regularizer` argument:" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "scale = 0.001" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "my_dense_layer = partial(\n", + " tf.layers.dense, activation=tf.nn.relu,\n", + " kernel_regularizer=tf.contrib.layers.l1_regularizer(scale))\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = my_dense_layer(X, n_hidden1, name=\"hidden1\")\n", + " hidden2 = my_dense_layer(hidden1, n_hidden2, name=\"hidden2\")\n", + " logits = my_dense_layer(hidden2, n_outputs, activation=None,\n", + " name=\"outputs\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Next we must add the regularization losses to the base loss:" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"loss\"): # not shown in the book\n", + " xentropy = 
tf.nn.sparse_softmax_cross_entropy_with_logits( # not shown\n", + " labels=y, logits=logits) # not shown\n", + " base_loss = tf.reduce_mean(xentropy, name=\"avg_xentropy\") # not shown\n", + " reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)\n", + " loss = tf.add_n([base_loss] + reg_losses, name=\"loss\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "And the rest is the same as usual:" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name=\"accuracy\")\n", + "\n", + "learning_rate = 0.01\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n", + " training_op = optimizer.minimize(loss)\n", + "\n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": true + }, + "outputs": [], + "source": [ + "n_epochs = 20\n", + "batch_size = 200\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " accuracy_val = accuracy.eval(feed_dict={X: mnist.test.images,\n", + " y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", accuracy_val)\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Dropout" 
] }, { @@ -852,7 +2686,23 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 91, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")" + ] + }, + { + "cell_type": "code", + "execution_count": 92, "metadata": { "collapsed": false, "deletable": true, @@ -860,45 +2710,38 @@ }, "outputs": [], "source": [ - "from functools import partial\n", + "training = tf.placeholder_with_default(False, shape=(), name='training')\n", "\n", - "tf.reset_default_graph()\n", - "\n", - "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", - "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", - "is_training = tf.placeholder(tf.bool, shape=(), name='is_training')\n", - "\n", - "initial_learning_rate = 0.1\n", - "decay_steps = 10000\n", - "decay_rate = 1/10\n", - "global_step = tf.Variable(0, trainable=False)\n", - "learning_rate = tf.train.exponential_decay(initial_learning_rate, global_step,\n", - " decay_steps, decay_rate)\n", - "\n", - "dropout_rate = 0.5\n", + "dropout_rate = 0.5 # == 1 - keep_prob\n", + "X_drop = tf.layers.dropout(X, dropout_rate, training=training)\n", "\n", "with tf.name_scope(\"dnn\"):\n", - " he_init = tf.contrib.layers.variance_scaling_initializer()\n", - "\n", - " my_dense_layer = partial(\n", - " tf.layers.dense,\n", - " activation=tf.nn.elu,\n", - " kernel_initializer=he_init)\n", - "\n", - " X_drop = tf.layers.dropout(X, dropout_rate, training=is_training)\n", - " hidden1 = my_dense_layer(X_drop, n_hidden1, name=\"hidden1\")\n", - " hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=is_training)\n", - " hidden2 = my_dense_layer(hidden1_drop, n_hidden2, name=\"hidden2\")\n", - " hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=is_training)\n", - " logits 
= my_dense_layer(hidden2_drop, n_outputs, activation=None, name=\"outputs\")\n", - "\n", + " hidden1 = tf.layers.dense(X_drop, n_hidden1, activation=tf.nn.relu,\n", + " name=\"hidden1\")\n", + " hidden1_drop = tf.layers.dropout(hidden1, dropout_rate, training=training)\n", + " hidden2 = tf.layers.dense(hidden1_drop, n_hidden2, activation=tf.nn.relu,\n", + " name=\"hidden2\")\n", + " hidden2_drop = tf.layers.dropout(hidden2, dropout_rate, training=training)\n", + " logits = tf.layers.dense(hidden2_drop, n_outputs, name=\"outputs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ "with tf.name_scope(\"loss\"):\n", " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", "\n", "with tf.name_scope(\"train\"):\n", - " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)\n", - " training_op = optimizer.minimize(loss, global_step=global_step) \n", + " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum=0.9)\n", + " training_op = optimizer.minimize(loss) \n", "\n", "with tf.name_scope(\"eval\"):\n", " correct = tf.nn.in_top_k(logits, y, 1)\n", @@ -910,11 +2753,12 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 94, "metadata": { "collapsed": false, "deletable": true, - "editable": true + "editable": true, + "scrolled": true }, "outputs": [], "source": [ @@ -924,19 +2768,115 @@ "with tf.Session() as sess:\n", " init.run()\n", " for epoch in range(n_epochs):\n", - " for iteration in range(len(mnist.test.labels)//batch_size):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", " X_batch, y_batch = mnist.train.next_batch(batch_size)\n", - " sess.run(training_op, feed_dict={is_training: True, X: X_batch, y: y_batch})\n", - " acc_train = accuracy.eval(feed_dict={is_training: False, X: X_batch, y: 
y_batch})\n", - " acc_test = accuracy.eval(feed_dict={is_training: False, X: mnist.test.images, y: mnist.test.labels})\n", - " print(epoch, \"Train accuracy:\", acc_train, \"Test accuracy:\", acc_test)\n", + " sess.run(training_op, feed_dict={training: True, X: X_batch, y: y_batch})\n", + " acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})\n", + " print(epoch, \"Test accuracy:\", acc_test)\n", "\n", - " save_path = saver.save(sess, \"my_model_final.ckpt\")" + " save_path = saver.save(sess, \"./my_model_final.ckpt\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Max norm" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's go back to a plain and simple neural net for MNIST with just 2 hidden layers:" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 95, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28\n", + "n_hidden1 = 300\n", + "n_hidden2 = 50\n", + "n_outputs = 10\n", + "\n", + "learning_rate = 0.01\n", + "momentum = 0.9\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, name=\"hidden1\")\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, name=\"hidden2\")\n", + " logits = tf.layers.dense(hidden2, n_outputs, name=\"outputs\")\n", + "\n", + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = 
tf.train.MomentumOptimizer(learning_rate, momentum)\n", + " training_op = optimizer.minimize(loss) \n", + "\n", + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Next, let's get a handle on the first hidden layer's weights and create an operation that will compute the clipped weights using the `clip_by_norm()` function. Then we create an assignment operation to assign the clipped weights to the weights variable:" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "threshold = 1.0\n", + "weights = tf.get_default_graph().get_tensor_by_name(\"hidden1/kernel:0\")\n", + "clipped_weights = tf.clip_by_norm(weights, clip_norm=threshold, axes=1)\n", + "clip_weights = tf.assign(weights, clipped_weights)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "We can do this as well for the second hidden layer:" + ] + }, + { + "cell_type": "code", + "execution_count": 97, "metadata": { "collapsed": true, "deletable": true, @@ -944,13 +2884,62 @@ }, "outputs": [], "source": [ - "train_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,\n", - " scope=\"hidden[2]|outputs\")" + "weights2 = tf.get_default_graph().get_tensor_by_name(\"hidden2/kernel:0\")\n", + "clipped_weights2 = tf.clip_by_norm(weights2, clip_norm=threshold, axes=1)\n", + "clip_weights2 = tf.assign(weights2, clipped_weights2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Let's add an initializer and a saver:" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": true, + "deletable": true, 
+ "editable": true + }, + "outputs": [], + "source": [ + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "And now we can train the model. It's pretty much as usual, except that right after running the `training_op`, we run the `clip_weights` and `clip_weights2` operations:" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "n_epochs = 20\n", + "batch_size = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 100, "metadata": { "collapsed": false, "deletable": true, @@ -958,12 +2947,64 @@ }, "outputs": [], "source": [ - "training_op2 = optimizer.minimize(loss, var_list=train_vars)" + "with tf.Session() as sess: # not shown in the book\n", + " init.run() # not shown\n", + " for epoch in range(n_epochs): # not shown\n", + " for iteration in range(mnist.train.num_examples // batch_size): # not shown\n", + " X_batch, y_batch = mnist.train.next_batch(batch_size) # not shown\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " clip_weights.eval()\n", + " clip_weights2.eval() # not shown\n", + " acc_test = accuracy.eval(feed_dict={X: mnist.test.images, # not shown\n", + " y: mnist.test.labels}) # not shown\n", + " print(epoch, \"Test accuracy:\", acc_test) # not shown\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\") # not shown" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "The implementation above is straightforward and it works fine, but it is a bit messy. 
A better approach is to define a `max_norm_regularizer()` function:" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 101, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def max_norm_regularizer(threshold, axes=1, name=\"max_norm\",\n", + " collection=\"max_norm\"):\n", + " def max_norm(weights):\n", + " clipped = tf.clip_by_norm(weights, clip_norm=threshold, axes=axes)\n", + " clip_weights = tf.assign(weights, clipped, name=name)\n", + " tf.add_to_collection(collection, clip_weights)\n", + " return None # there is no regularization loss term\n", + " return max_norm" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Then you can call this function to get a max norm regularizer (with the threshold you want). When you create a hidden layer, you can pass this regularizer to the `kernel_regularizer` argument:" + ] + }, + { + "cell_type": "code", + "execution_count": 102, "metadata": { "collapsed": false, "deletable": true, @@ -971,36 +3012,115 @@ }, "outputs": [], "source": [ - "for i in tf.global_variables():\n", - " print(i.name)" + "tf.reset_default_graph()\n", + "\n", + "n_inputs = 28 * 28\n", + "n_hidden1 = 300\n", + "n_hidden2 = 50\n", + "n_outputs = 10\n", + "\n", + "learning_rate = 0.01\n", + "momentum = 0.9\n", + "\n", + "X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n", + "y = tf.placeholder(tf.int64, shape=(None), name=\"y\")" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 103, "metadata": { - "collapsed": false, + "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ - "for i in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):\n", - " print(i.name)" + "max_norm_reg = max_norm_regularizer(threshold=1.0)\n", + "\n", + "with tf.name_scope(\"dnn\"):\n", + " hidden1 = tf.layers.dense(X, n_hidden1, 
activation=tf.nn.relu,\n", + " kernel_regularizer=max_norm_reg, name=\"hidden1\")\n", + " hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu,\n", + " kernel_regularizer=max_norm_reg, name=\"hidden2\")\n", + " logits = tf.layers.dense(hidden2, n_outputs, name=\"outputs\")" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 104, "metadata": { - "collapsed": false, + "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ - "for i in train_vars:\n", - " print(i.name)" + "with tf.name_scope(\"loss\"):\n", + " xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n", + " loss = tf.reduce_mean(xentropy, name=\"loss\")\n", + "\n", + "with tf.name_scope(\"train\"):\n", + " optimizer = tf.train.MomentumOptimizer(learning_rate, momentum)\n", + " training_op = optimizer.minimize(loss) \n", + "\n", + "with tf.name_scope(\"eval\"):\n", + " correct = tf.nn.in_top_k(logits, y, 1)\n", + " accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))\n", + "\n", + "init = tf.global_variables_initializer()\n", + "saver = tf.train.Saver()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "Training is as usual, except you must run the weights clipping operations after each training operation:" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "n_epochs = 20\n", + "batch_size = 50" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true, + "scrolled": false + }, + "outputs": [], + "source": [ + "clip_all_weights = tf.get_collection(\"max_norm\")\n", + "\n", + "with tf.Session() as sess:\n", + " init.run()\n", + " for epoch in range(n_epochs):\n", + " for iteration in range(mnist.train.num_examples // batch_size):\n", + " 
X_batch, y_batch = mnist.train.next_batch(batch_size)\n", + " sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n", + " sess.run(clip_all_weights)\n", + " acc_test = accuracy.eval(feed_dict={X: mnist.test.images, # not shown in the book\n", + " y: mnist.test.labels}) # not shown\n", + " print(epoch, \"Test accuracy:\", acc_test) # not shown\n", + "\n", + " save_path = saver.save(sess, \"./my_model_final.ckpt\") # not shown" ] }, {