From b5721714f779a657bf7931a4d054264cf5669e06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sun, 25 Sep 2016 18:42:01 +0200 Subject: [PATCH] Add TF models slim nets --- .gitignore | 1 + nets/README.md | 5 + nets/__init__.py | 1 + nets/alexnet.py | 125 +++++++ nets/alexnet_test.py | 145 ++++++++ nets/cifarnet.py | 112 ++++++ nets/inception.py | 33 ++ nets/inception_resnet_v2.py | 280 +++++++++++++++ nets/inception_resnet_v2_test.py | 136 +++++++ nets/inception_v1.py | 340 ++++++++++++++++++ nets/inception_v1_test.py | 210 +++++++++++ nets/inception_v2.py | 545 ++++++++++++++++++++++++++++ nets/inception_v2_test.py | 262 ++++++++++++++ nets/inception_v3.py | 587 +++++++++++++++++++++++++++++++ nets/inception_v3_test.py | 292 +++++++++++++++ nets/lenet.py | 93 +++++ nets/nets_factory.py | 107 ++++++ nets/nets_factory_test.py | 46 +++ nets/overfeat.py | 118 +++++++ nets/overfeat_test.py | 145 ++++++++ nets/resnet_utils.py | 254 +++++++++++++ nets/resnet_v1.py | 295 ++++++++++++++++ nets/resnet_v1_test.py | 450 +++++++++++++++++++++++ nets/resnet_v2.py | 302 ++++++++++++++++ nets/resnet_v2_test.py | 453 ++++++++++++++++++++++++ nets/vgg.py | 244 +++++++++++++ nets/vgg_test.py | 455 ++++++++++++++++++++++++ 27 files changed, 6036 insertions(+) create mode 100644 nets/README.md create mode 100644 nets/__init__.py create mode 100644 nets/alexnet.py create mode 100644 nets/alexnet_test.py create mode 100644 nets/cifarnet.py create mode 100644 nets/inception.py create mode 100644 nets/inception_resnet_v2.py create mode 100644 nets/inception_resnet_v2_test.py create mode 100644 nets/inception_v1.py create mode 100644 nets/inception_v1_test.py create mode 100644 nets/inception_v2.py create mode 100644 nets/inception_v2_test.py create mode 100644 nets/inception_v3.py create mode 100644 nets/inception_v3_test.py create mode 100644 nets/lenet.py create mode 100644 nets/nets_factory.py create mode 100644 nets/nets_factory_test.py create mode 100644 nets/overfeat.py create mode 100644 nets/overfeat_test.py create mode 100644 nets/resnet_utils.py create mode 100644 nets/resnet_v1.py create mode 100644 nets/resnet_v1_test.py create mode 100644 nets/resnet_v2.py create mode 100644 nets/resnet_v2_test.py create mode 100644 nets/vgg.py create mode 100644 nets/vgg_test.py diff --git a/.gitignore b/.gitignore index 26d8704..a5684d8 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .DS_Store my_* images/**/*.png +*.pyc diff --git a/nets/README.md b/nets/README.md new file mode 100644 index 0000000..8f1ce71 --- /dev/null +++ b/nets/README.md @@ -0,0 +1,5 @@ +This directory was copied from: +https://github.com/tensorflow/models/blob/master/slim/nets + +On Sept. 25th, 2016. Commit: +https://github.com/tensorflow/models/commit/65fad62dc6daca5a72c204013824cc380939d457 diff --git a/nets/__init__.py b/nets/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/nets/__init__.py @@ -0,0 +1 @@ + diff --git a/nets/alexnet.py b/nets/alexnet.py new file mode 100644 index 0000000..7aca487 --- /dev/null +++ b/nets/alexnet.py @@ -0,0 +1,125 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains a model definition for AlexNet. + +This work was first described in: + ImageNet Classification with Deep Convolutional Neural Networks + Alex Krizhevsky, Ilya Sutskever and Geoffrey E. Hinton + +and later refined in: + One weird trick for parallelizing convolutional neural networks + Alex Krizhevsky, 2014 + +Here we provide the implementation proposed in "One weird trick" and not +"ImageNet Classification", as per the paper, the LRN layers have been removed. + +Usage: + with slim.arg_scope(alexnet.alexnet_v2_arg_scope()): + outputs, end_points = alexnet.alexnet_v2(inputs) + +@@alexnet_v2 +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim +trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) + + +def alexnet_v2_arg_scope(weight_decay=0.0005): + with slim.arg_scope([slim.conv2d, slim.fully_connected], + activation_fn=tf.nn.relu, + biases_initializer=tf.constant_initializer(0.1), + weights_regularizer=slim.l2_regularizer(weight_decay)): + with slim.arg_scope([slim.conv2d], padding='SAME'): + with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc: + return arg_sc + + +def alexnet_v2(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.5, + spatial_squeeze=True, + scope='alexnet_v2'): + """AlexNet version 2. + + Described in: http://arxiv.org/pdf/1404.5997v2.pdf + Parameters from: + github.com/akrizhevsky/cuda-convnet2/blob/master/layers/ + layers-imagenet-1gpu.cfg + + Note: All the fully_connected layers have been transformed to conv2d layers. + To use in classification mode, resize input to 224x224. To use in fully + convolutional mode, set spatial_squeeze to false. + The LRN layers have been removed and change the initializers from + random_normal_initializer to xavier_initializer. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether or not the model is being trained. + dropout_keep_prob: the probability that activations are kept in the dropout + layers during training. + spatial_squeeze: whether or not should squeeze the spatial dimensions of the + outputs. Useful to remove unnecessary dimensions for classification. + scope: Optional scope for the variables. + + Returns: + the last op containing the log predictions and end_points dict. + """ + with tf.variable_scope(scope, 'alexnet_v2', [inputs]) as sc: + end_points_collection = sc.name + '_end_points' + # Collect outputs for conv2d, fully_connected and max_pool2d. + with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], + outputs_collections=[end_points_collection]): + net = slim.conv2d(inputs, 64, [11, 11], 4, padding='VALID', + scope='conv1') + net = slim.max_pool2d(net, [3, 3], 2, scope='pool1') + net = slim.conv2d(net, 192, [5, 5], scope='conv2') + net = slim.max_pool2d(net, [3, 3], 2, scope='pool2') + net = slim.conv2d(net, 384, [3, 3], scope='conv3') + net = slim.conv2d(net, 384, [3, 3], scope='conv4') + net = slim.conv2d(net, 256, [3, 3], scope='conv5') + net = slim.max_pool2d(net, [3, 3], 2, scope='pool5') + + # Use conv2d instead of fully_connected layers. + with slim.arg_scope([slim.conv2d], + weights_initializer=trunc_normal(0.005), + biases_initializer=tf.constant_initializer(0.1)): + net = slim.conv2d(net, 4096, [5, 5], padding='VALID', + scope='fc6') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout6') + net = slim.conv2d(net, 4096, [1, 1], scope='fc7') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout7') + net = slim.conv2d(net, num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + biases_initializer=tf.zeros_initializer, + scope='fc8') + + # Convert end_points_collection into a end_point dict. + end_points = dict(tf.get_collection(end_points_collection)) + if spatial_squeeze: + net = tf.squeeze(net, [1, 2], name='fc8/squeezed') + end_points[sc.name + '/fc8'] = net + return net, end_points +alexnet_v2.default_image_size = 224 diff --git a/nets/alexnet_test.py b/nets/alexnet_test.py new file mode 100644 index 0000000..493c460 --- /dev/null +++ b/nets/alexnet_test.py @@ -0,0 +1,145 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for slim.nets.alexnet.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from nets import alexnet + +slim = tf.contrib.slim + + +class AlexnetV2Test(tf.test.TestCase): + + def testBuild(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = alexnet.alexnet_v2(inputs, num_classes) + self.assertEquals(logits.op.name, 'alexnet_v2/fc8/squeezed') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + + def testFullyConvolutional(self): + batch_size = 1 + height, width = 300, 400 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = alexnet.alexnet_v2(inputs, num_classes, spatial_squeeze=False) + self.assertEquals(logits.op.name, 'alexnet_v2/fc8/BiasAdd') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, 4, 7, num_classes]) + + def testEndPoints(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = alexnet.alexnet_v2(inputs, num_classes) + expected_names = ['alexnet_v2/conv1', + 'alexnet_v2/pool1', + 'alexnet_v2/conv2', + 'alexnet_v2/pool2', + 'alexnet_v2/conv3', + 'alexnet_v2/conv4', + 'alexnet_v2/conv5', + 'alexnet_v2/pool5', + 'alexnet_v2/fc6', + 'alexnet_v2/fc7', + 'alexnet_v2/fc8' + ] + self.assertSetEqual(set(end_points.keys()), set(expected_names)) + + def testModelVariables(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + alexnet.alexnet_v2(inputs, num_classes) + expected_names = ['alexnet_v2/conv1/weights', + 'alexnet_v2/conv1/biases', + 'alexnet_v2/conv2/weights', + 'alexnet_v2/conv2/biases', + 'alexnet_v2/conv3/weights', + 'alexnet_v2/conv3/biases', + 'alexnet_v2/conv4/weights', + 'alexnet_v2/conv4/biases', + 'alexnet_v2/conv5/weights', + 'alexnet_v2/conv5/biases', + 'alexnet_v2/fc6/weights', + 'alexnet_v2/fc6/biases', + 'alexnet_v2/fc7/weights', + 'alexnet_v2/fc7/biases', + 'alexnet_v2/fc8/weights', + 'alexnet_v2/fc8/biases', + ] + model_variables = [v.op.name for v in slim.get_model_variables()] + self.assertSetEqual(set(model_variables), set(expected_names)) + + def testEvaluation(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = alexnet.alexnet_v2(eval_inputs, is_training=False) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + predictions = tf.argmax(logits, 1) + self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) + + def testTrainEvalWithReuse(self): + train_batch_size = 2 + eval_batch_size = 1 + train_height, train_width = 224, 224 + eval_height, eval_width = 300, 400 + num_classes = 1000 + with self.test_session(): + train_inputs = tf.random_uniform( + (train_batch_size, train_height, train_width, 3)) + logits, _ = alexnet.alexnet_v2(train_inputs) + self.assertListEqual(logits.get_shape().as_list(), + [train_batch_size, num_classes]) + tf.get_variable_scope().reuse_variables() + eval_inputs = tf.random_uniform( + (eval_batch_size, eval_height, eval_width, 3)) + logits, _ = alexnet.alexnet_v2(eval_inputs, is_training=False, + spatial_squeeze=False) + self.assertListEqual(logits.get_shape().as_list(), + [eval_batch_size, 4, 7, num_classes]) + logits = tf.reduce_mean(logits, [1, 2]) + predictions = tf.argmax(logits, 1) + self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) + + def testForward(self): + batch_size = 1 + height, width = 224, 224 + with self.test_session() as sess: + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = alexnet.alexnet_v2(inputs) + sess.run(tf.initialize_all_variables()) + output = sess.run(logits) + self.assertTrue(output.any()) + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/cifarnet.py b/nets/cifarnet.py new file mode 100644 index 0000000..371a9cb --- /dev/null +++ b/nets/cifarnet.py @@ -0,0 +1,112 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains a variant of the CIFAR-10 model definition.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim + +trunc_normal = lambda stddev: tf.truncated_normal_initializer(stddev=stddev) + + +def cifarnet(images, num_classes=10, is_training=False, + dropout_keep_prob=0.5, + prediction_fn=slim.softmax, + scope='CifarNet'): + """Creates a variant of the CifarNet model. + + Note that since the output is a set of 'logits', the values fall in the + interval of (-infinity, infinity). Consequently, to convert the outputs to a + probability distribution over the characters, one will need to convert them + using the softmax function: + + logits = cifarnet.cifarnet(images, is_training=False) + probabilities = tf.nn.softmax(logits) + predictions = tf.argmax(logits, 1) + + Args: + images: A batch of `Tensors` of size [batch_size, height, width, channels]. + num_classes: the number of classes in the dataset. + is_training: specifies whether or not we're currently training the model. + This variable will determine the behaviour of the dropout layer. + dropout_keep_prob: the percentage of activation values that are retained. + prediction_fn: a function to get predictions out of logits. + scope: Optional variable_scope. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, `num_classes`] + end_points: a dictionary from components of the network to the corresponding + activation. + """ + end_points = {} + + with tf.variable_scope(scope, 'CifarNet', [images, num_classes]): + net = slim.conv2d(images, 64, [5, 5], scope='conv1') + end_points['conv1'] = net + net = slim.max_pool2d(net, [2, 2], 2, scope='pool1') + end_points['pool1'] = net + net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm1') + net = slim.conv2d(net, 64, [5, 5], scope='conv2') + end_points['conv2'] = net + net = tf.nn.lrn(net, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm2') + net = slim.max_pool2d(net, [2, 2], 2, scope='pool2') + end_points['pool2'] = net + net = slim.flatten(net) + end_points['Flatten'] = net + net = slim.fully_connected(net, 384, scope='fc3') + end_points['fc3'] = net + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout3') + net = slim.fully_connected(net, 192, scope='fc4') + end_points['fc4'] = net + logits = slim.fully_connected(net, num_classes, + biases_initializer=tf.zeros_initializer, + weights_initializer=trunc_normal(1/192.0), + weights_regularizer=None, + activation_fn=None, + scope='logits') + + end_points['Logits'] = logits + end_points['Predictions'] = prediction_fn(logits, scope='Predictions') + + return logits, end_points +cifarnet.default_image_size = 32 + + +def cifarnet_arg_scope(weight_decay=0.004): + """Defines the default cifarnet argument scope. + + Args: + weight_decay: The weight decay to use for regularizing the model. + + Returns: + An `arg_scope` to use for the inception v3 model. + """ + with slim.arg_scope( + [slim.conv2d], + weights_initializer=tf.truncated_normal_initializer(stddev=5e-2), + activation_fn=tf.nn.relu): + with slim.arg_scope( + [slim.fully_connected], + biases_initializer=tf.constant_initializer(0.1), + weights_initializer=trunc_normal(0.04), + weights_regularizer=slim.l2_regularizer(weight_decay), + activation_fn=tf.nn.relu) as sc: + return sc diff --git a/nets/inception.py b/nets/inception.py new file mode 100644 index 0000000..0479e3d --- /dev/null +++ b/nets/inception.py @@ -0,0 +1,33 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Brings inception_v1, inception_v2 and inception_v3 under one namespace.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=unused-import +from nets.inception_resnet_v2 import inception_resnet_v2 +from nets.inception_resnet_v2 import inception_resnet_v2_arg_scope +from nets.inception_v1 import inception_v1 +from nets.inception_v1 import inception_v1_arg_scope +from nets.inception_v1 import inception_v1_base +from nets.inception_v2 import inception_v2 +from nets.inception_v2 import inception_v2_arg_scope +from nets.inception_v2 import inception_v2_base +from nets.inception_v3 import inception_v3 +from nets.inception_v3 import inception_v3_arg_scope +from nets.inception_v3 import inception_v3_base +# pylint: enable=unused-import diff --git a/nets/inception_resnet_v2.py b/nets/inception_resnet_v2.py new file mode 100644 index 0000000..4b3c5bd --- /dev/null +++ b/nets/inception_resnet_v2.py @@ -0,0 +1,280 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains the definition of the Inception Resnet V2 architecture. + +As described in http://arxiv.org/abs/1602.07261. + + Inception-v4, Inception-ResNet and the Impact of Residual Connections + on Learning + Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke, Alex Alemi +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import tensorflow as tf + +slim = tf.contrib.slim + + +def block35(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None): + """Builds the 35x35 resnet block.""" + with tf.variable_scope(scope, 'Block35', [net], reuse=reuse): + with tf.variable_scope('Branch_0'): + tower_conv = slim.conv2d(net, 32, 1, scope='Conv2d_1x1') + with tf.variable_scope('Branch_1'): + tower_conv1_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1') + tower_conv1_1 = slim.conv2d(tower_conv1_0, 32, 3, scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + tower_conv2_0 = slim.conv2d(net, 32, 1, scope='Conv2d_0a_1x1') + tower_conv2_1 = slim.conv2d(tower_conv2_0, 48, 3, scope='Conv2d_0b_3x3') + tower_conv2_2 = slim.conv2d(tower_conv2_1, 64, 3, scope='Conv2d_0c_3x3') + mixed = tf.concat(3, [tower_conv, tower_conv1_1, tower_conv2_2]) + up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None, + activation_fn=None, scope='Conv2d_1x1') + net += scale * up + if activation_fn: + net = activation_fn(net) + return net + + +def block17(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None): + """Builds the 17x17 resnet block.""" + with tf.variable_scope(scope, 'Block17', [net], reuse=reuse): + with tf.variable_scope('Branch_0'): + tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1') + with tf.variable_scope('Branch_1'): + tower_conv1_0 = slim.conv2d(net, 128, 1, scope='Conv2d_0a_1x1') + tower_conv1_1 = slim.conv2d(tower_conv1_0, 160, [1, 7], + scope='Conv2d_0b_1x7') + tower_conv1_2 = slim.conv2d(tower_conv1_1, 192, [7, 1], + scope='Conv2d_0c_7x1') + mixed = tf.concat(3, [tower_conv, tower_conv1_2]) + up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None, + activation_fn=None, scope='Conv2d_1x1') + net += scale * up + if activation_fn: + net = activation_fn(net) + return net + + +def block8(net, scale=1.0, activation_fn=tf.nn.relu, scope=None, reuse=None): + """Builds the 8x8 resnet block.""" + with tf.variable_scope(scope, 'Block8', [net], reuse=reuse): + with tf.variable_scope('Branch_0'): + tower_conv = slim.conv2d(net, 192, 1, scope='Conv2d_1x1') + with tf.variable_scope('Branch_1'): + tower_conv1_0 = slim.conv2d(net, 192, 1, scope='Conv2d_0a_1x1') + tower_conv1_1 = slim.conv2d(tower_conv1_0, 224, [1, 3], + scope='Conv2d_0b_1x3') + tower_conv1_2 = slim.conv2d(tower_conv1_1, 256, [3, 1], + scope='Conv2d_0c_3x1') + mixed = tf.concat(3, [tower_conv, tower_conv1_2]) + up = slim.conv2d(mixed, net.get_shape()[3], 1, normalizer_fn=None, + activation_fn=None, scope='Conv2d_1x1') + net += scale * up + if activation_fn: + net = activation_fn(net) + return net + + +def inception_resnet_v2(inputs, num_classes=1001, is_training=True, + dropout_keep_prob=0.8, + reuse=None, + scope='InceptionResnetV2'): + """Creates the Inception Resnet V2 model. + + Args: + inputs: a 4-D tensor of size [batch_size, height, width, 3]. + num_classes: number of predicted classes. + is_training: whether is training or not. + dropout_keep_prob: float, the fraction to keep before final layer. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + + Returns: + logits: the logits outputs of the model. + end_points: the set of end_points from the inception model. + """ + end_points = {} + + with tf.variable_scope(scope, 'InceptionResnetV2', [inputs], reuse=reuse): + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + + # 149 x 149 x 32 + net = slim.conv2d(inputs, 32, 3, stride=2, padding='VALID', + scope='Conv2d_1a_3x3') + end_points['Conv2d_1a_3x3'] = net + # 147 x 147 x 32 + net = slim.conv2d(net, 32, 3, padding='VALID', + scope='Conv2d_2a_3x3') + end_points['Conv2d_2a_3x3'] = net + # 147 x 147 x 64 + net = slim.conv2d(net, 64, 3, scope='Conv2d_2b_3x3') + end_points['Conv2d_2b_3x3'] = net + # 73 x 73 x 64 + net = slim.max_pool2d(net, 3, stride=2, padding='VALID', + scope='MaxPool_3a_3x3') + end_points['MaxPool_3a_3x3'] = net + # 73 x 73 x 80 + net = slim.conv2d(net, 80, 1, padding='VALID', + scope='Conv2d_3b_1x1') + end_points['Conv2d_3b_1x1'] = net + # 71 x 71 x 192 + net = slim.conv2d(net, 192, 3, padding='VALID', + scope='Conv2d_4a_3x3') + end_points['Conv2d_4a_3x3'] = net + # 35 x 35 x 192 + net = slim.max_pool2d(net, 3, stride=2, padding='VALID', + scope='MaxPool_5a_3x3') + end_points['MaxPool_5a_3x3'] = net + + # 35 x 35 x 320 + with tf.variable_scope('Mixed_5b'): + with tf.variable_scope('Branch_0'): + tower_conv = slim.conv2d(net, 96, 1, scope='Conv2d_1x1') + with tf.variable_scope('Branch_1'): + tower_conv1_0 = slim.conv2d(net, 48, 1, scope='Conv2d_0a_1x1') + tower_conv1_1 = slim.conv2d(tower_conv1_0, 64, 5, + scope='Conv2d_0b_5x5') + with tf.variable_scope('Branch_2'): + tower_conv2_0 = slim.conv2d(net, 64, 1, scope='Conv2d_0a_1x1') + tower_conv2_1 = slim.conv2d(tower_conv2_0, 96, 3, + scope='Conv2d_0b_3x3') + tower_conv2_2 = slim.conv2d(tower_conv2_1, 96, 3, + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + tower_pool = slim.avg_pool2d(net, 3, stride=1, padding='SAME', + scope='AvgPool_0a_3x3') + tower_pool_1 = slim.conv2d(tower_pool, 64, 1, + scope='Conv2d_0b_1x1') + net = tf.concat(3, [tower_conv, tower_conv1_1, + tower_conv2_2, tower_pool_1]) + + end_points['Mixed_5b'] = net + net = slim.repeat(net, 10, block35, scale=0.17) + + # 17 x 17 x 1024 + with tf.variable_scope('Mixed_6a'): + with tf.variable_scope('Branch_0'): + tower_conv = slim.conv2d(net, 384, 3, stride=2, padding='VALID', + scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + tower_conv1_0 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1') + tower_conv1_1 = slim.conv2d(tower_conv1_0, 256, 3, + scope='Conv2d_0b_3x3') + tower_conv1_2 = slim.conv2d(tower_conv1_1, 384, 3, + stride=2, padding='VALID', + scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + tower_pool = slim.max_pool2d(net, 3, stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat(3, [tower_conv, tower_conv1_2, tower_pool]) + + end_points['Mixed_6a'] = net + net = slim.repeat(net, 20, block17, scale=0.10) + + # Auxillary tower + with tf.variable_scope('AuxLogits'): + aux = slim.avg_pool2d(net, 5, stride=3, padding='VALID', + scope='Conv2d_1a_3x3') + aux = slim.conv2d(aux, 128, 1, scope='Conv2d_1b_1x1') + aux = slim.conv2d(aux, 768, aux.get_shape()[1:3], + padding='VALID', scope='Conv2d_2a_5x5') + aux = slim.flatten(aux) + aux = slim.fully_connected(aux, num_classes, activation_fn=None, + scope='Logits') + end_points['AuxLogits'] = aux + + with tf.variable_scope('Mixed_7a'): + with tf.variable_scope('Branch_0'): + tower_conv = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1') + tower_conv_1 = slim.conv2d(tower_conv, 384, 3, stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + tower_conv1 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1') + tower_conv1_1 = slim.conv2d(tower_conv1, 288, 3, stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + tower_conv2 = slim.conv2d(net, 256, 1, scope='Conv2d_0a_1x1') + tower_conv2_1 = slim.conv2d(tower_conv2, 288, 3, + scope='Conv2d_0b_3x3') + tower_conv2_2 = slim.conv2d(tower_conv2_1, 320, 3, stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_3'): + tower_pool = slim.max_pool2d(net, 3, stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat(3, [tower_conv_1, tower_conv1_1, + tower_conv2_2, tower_pool]) + + end_points['Mixed_7a'] = net + + net = slim.repeat(net, 9, block8, scale=0.20) + net = block8(net, activation_fn=None) + + net = slim.conv2d(net, 1536, 1, scope='Conv2d_7b_1x1') + end_points['Conv2d_7b_1x1'] = net + + with tf.variable_scope('Logits'): + end_points['PrePool'] = net + net = slim.avg_pool2d(net, net.get_shape()[1:3], padding='VALID', + scope='AvgPool_1a_8x8') + net = slim.flatten(net) + + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='Dropout') + + end_points['PreLogitsFlatten'] = net + logits = slim.fully_connected(net, num_classes, activation_fn=None, + scope='Logits') + end_points['Logits'] = logits + end_points['Predictions'] = tf.nn.softmax(logits, name='Predictions') + + return logits, end_points +inception_resnet_v2.default_image_size = 299 + + +def inception_resnet_v2_arg_scope(weight_decay=0.00004, + batch_norm_decay=0.9997, + batch_norm_epsilon=0.001): + """Yields the scope with the default parameters for inception_resnet_v2. + + Args: + weight_decay: the weight decay for weights variables. + batch_norm_decay: decay for the moving average of batch_norm momentums. + batch_norm_epsilon: small float added to variance to avoid dividing by zero. + + Returns: + a arg_scope with the parameters needed for inception_resnet_v2. + """ + # Set weight_decay for weights in conv2d and fully_connected layers. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay), + biases_regularizer=slim.l2_regularizer(weight_decay)): + + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + } + # Set activation_fn and parameters for batch_norm. + with slim.arg_scope([slim.conv2d], activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params) as scope: + return scope diff --git a/nets/inception_resnet_v2_test.py b/nets/inception_resnet_v2_test.py new file mode 100644 index 0000000..b74756a --- /dev/null +++ b/nets/inception_resnet_v2_test.py @@ -0,0 +1,136 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for slim.inception_resnet_v2.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from nets import inception + + +class InceptionTest(tf.test.TestCase): + + def testBuildLogits(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = inception.inception_resnet_v2(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildEndPoints(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_resnet_v2(inputs, num_classes) + self.assertTrue('Logits' in end_points) + logits = end_points['Logits'] + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertTrue('AuxLogits' in end_points) + aux_logits = end_points['AuxLogits'] + self.assertListEqual(aux_logits.get_shape().as_list(), + [batch_size, num_classes]) + pre_pool = end_points['PrePool'] + self.assertListEqual(pre_pool.get_shape().as_list(), + [batch_size, 8, 8, 1536]) + + def testVariablesSetDevice(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + # Force all Variables to reside on the device. + with tf.variable_scope('on_cpu'), tf.device('/cpu:0'): + inception.inception_resnet_v2(inputs, num_classes) + with tf.variable_scope('on_gpu'), tf.device('/gpu:0'): + inception.inception_resnet_v2(inputs, num_classes) + for v in tf.get_collection(tf.GraphKeys.VARIABLES, scope='on_cpu'): + self.assertDeviceEqual(v.device, '/cpu:0') + for v in tf.get_collection(tf.GraphKeys.VARIABLES, scope='on_gpu'): + self.assertDeviceEqual(v.device, '/gpu:0') + + def testHalfSizeImages(self): + batch_size = 5 + height, width = 150, 150 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, end_points = inception.inception_resnet_v2(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + pre_pool = end_points['PrePool'] + self.assertListEqual(pre_pool.get_shape().as_list(), + [batch_size, 3, 3, 1536]) + + def testUnknownBatchSize(self): + batch_size = 1 + height, width = 299, 299 + num_classes = 1000 + with self.test_session() as sess: + inputs = tf.placeholder(tf.float32, (None, height, width, 3)) + logits, _ = inception.inception_resnet_v2(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionResnetV2/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [None, num_classes]) + images = tf.random_uniform((batch_size, height, width, 3)) + sess.run(tf.initialize_all_variables()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEquals(output.shape, (batch_size, num_classes)) + + def testEvaluation(self): + batch_size = 2 + height, width = 299, 299 + num_classes = 1000 + with self.test_session() as sess: + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = inception.inception_resnet_v2(eval_inputs, + num_classes, + is_training=False) + predictions = tf.argmax(logits, 1) + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (batch_size,)) + + def testTrainEvalWithReuse(self): + train_batch_size = 5 + eval_batch_size = 2 + height, width = 150, 150 + num_classes = 1000 + with self.test_session() as sess: + train_inputs = tf.random_uniform((train_batch_size, height, width, 3)) + inception.inception_resnet_v2(train_inputs, num_classes) + eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3)) + logits, _ = inception.inception_resnet_v2(eval_inputs, + num_classes, + is_training=False, + reuse=True) + predictions = tf.argmax(logits, 1) + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (eval_batch_size,)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/inception_v1.py b/nets/inception_v1.py new file mode 100644 index 0000000..400eff6 --- /dev/null +++ b/nets/inception_v1.py @@ -0,0 +1,340 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains the definition for inception v1 classification network.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim +trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) + + +def inception_v1_base(inputs, + final_endpoint='Mixed_5c', + scope='InceptionV1'): + """Defines the Inception V1 base architecture. + + This architecture is defined in: + Going deeper with convolutions + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. + http://arxiv.org/pdf/1409.4842v1.pdf. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + final_endpoint: specifies the endpoint to construct the network up to. It + can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', + 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', + 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', + 'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', 'Mixed_5c'] + scope: Optional variable_scope. + + Returns: + A dictionary from components of the network to the corresponding activation. + + Raises: + ValueError: if final_endpoint is not set to one of the predefined values. + """ + end_points = {} + with tf.variable_scope(scope, 'InceptionV1', [inputs]): + with slim.arg_scope( + [slim.conv2d, slim.fully_connected], + weights_initializer=trunc_normal(0.01)): + with slim.arg_scope([slim.conv2d, slim.max_pool2d], + stride=1, padding='SAME'): + end_point = 'Conv2d_1a_7x7' + net = slim.conv2d(inputs, 64, [7, 7], stride=2, scope=end_point) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + end_point = 'MaxPool_2a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + end_point = 'Conv2d_2b_1x1' + net = slim.conv2d(net, 64, [1, 1], scope=end_point) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + end_point = 'Conv2d_2c_3x3' + net = slim.conv2d(net, 192, [3, 3], scope=end_point) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + end_point = 'MaxPool_3a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_3b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 64, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 96, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 128, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 16, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 32, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 32, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_3c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 192, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 96, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'MaxPool_4a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_4b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 192, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 96, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 208, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 16, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 48, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_4c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 160, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 112, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 224, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 24, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 64, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_4d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 128, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 256, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 24, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 64, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_4e' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 112, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 144, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 288, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 64, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 64, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_4f' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 256, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 160, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 320, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 128, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'MaxPool_5a_2x2' + net = slim.max_pool2d(net, [2, 2], stride=2, scope=end_point) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_5b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 256, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 160, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 320, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 32, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 128, [3, 3], scope='Conv2d_0a_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + + end_point = 'Mixed_5c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, 384, [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, 192, [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, 384, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, 48, [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, 128, [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, 128, [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if final_endpoint == end_point: return net, end_points + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + +def inception_v1(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.8, + prediction_fn=slim.softmax, + spatial_squeeze=True, + reuse=None, + scope='InceptionV1'): + """Defines the Inception V1 architecture. + + This architecture is defined in: + + Going deeper with convolutions + Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed, + Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich. + http://arxiv.org/pdf/1409.4842v1.pdf. + + The default image size used to train this network is 224x224. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether is training or not. + dropout_keep_prob: the percentage of activation values that are retained. + prediction_fn: a function to get predictions out of logits. + spatial_squeeze: if True, logits is of shape is [B, C], if false logits is + of shape [B, 1, 1, C], where B is batch_size and C is number of classes. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, num_classes] + end_points: a dictionary from components of the network to the corresponding + activation. + """ + # Final pooling and prediction + with tf.variable_scope(scope, 'InceptionV1', [inputs, num_classes], + reuse=reuse) as scope: + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + net, end_points = inception_v1_base(inputs, scope=scope) + with tf.variable_scope('Logits'): + net = slim.avg_pool2d(net, [7, 7], stride=1, scope='MaxPool_0a_7x7') + net = slim.dropout(net, + dropout_keep_prob, scope='Dropout_0b') + logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, scope='Conv2d_0c_1x1') + if spatial_squeeze: + logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') + + end_points['Logits'] = logits + end_points['Predictions'] = prediction_fn(logits, scope='Predictions') + return logits, end_points +inception_v1.default_image_size = 224 + + +def inception_v1_arg_scope(weight_decay=0.00004, + use_batch_norm=True): + """Defines the default InceptionV1 arg scope. + + Note: Althougth the original paper didn't use batch_norm we found it useful. + + Args: + weight_decay: The weight decay to use for regularizing the model. + use_batch_norm: "If `True`, batch_norm is applied after each convolution. + + Returns: + An `arg_scope` to use for the inception v3 model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': 0.9997, + # epsilon to prevent 0s in variance. + 'epsilon': 0.001, + # collection containing update_ops. + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + if use_batch_norm: + normalizer_fn = slim.batch_norm + normalizer_params = batch_norm_params + else: + normalizer_fn = None + normalizer_params = {} + # Set weight_decay for weights in Conv and FC layers. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay)): + with slim.arg_scope( + [slim.conv2d], + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=tf.nn.relu, + normalizer_fn=normalizer_fn, + normalizer_params=normalizer_params) as sc: + return sc diff --git a/nets/inception_v1_test.py b/nets/inception_v1_test.py new file mode 100644 index 0000000..a769538 --- /dev/null +++ b/nets/inception_v1_test.py @@ -0,0 +1,210 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for nets.inception_v1.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from nets import inception + +slim = tf.contrib.slim + + +class InceptionV1Test(tf.test.TestCase): + + def testBuildClassificationNetwork(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, end_points = inception.inception_v1(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV1/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertTrue('Predictions' in end_points) + self.assertListEqual(end_points['Predictions'].get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildBaseNetwork(self): + batch_size = 5 + height, width = 224, 224 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + mixed_6c, end_points = inception.inception_v1_base(inputs) + self.assertTrue(mixed_6c.op.name.startswith('InceptionV1/Mixed_5c')) + self.assertListEqual(mixed_6c.get_shape().as_list(), + [batch_size, 7, 7, 1024]) + expected_endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', + 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', + 'Mixed_3c', 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', + 'Mixed_4d', 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2', + 'Mixed_5b', 'Mixed_5c'] + self.assertItemsEqual(end_points.keys(), expected_endpoints) + + def testBuildOnlyUptoFinalEndpoint(self): + batch_size = 5 + height, width = 224, 224 + endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', + 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', + 'MaxPool_4a_3x3', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', + 'Mixed_4e', 'Mixed_4f', 'MaxPool_5a_2x2', 'Mixed_5b', + 'Mixed_5c'] + for index, endpoint in enumerate(endpoints): + with tf.Graph().as_default(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + out_tensor, end_points = inception.inception_v1_base( + inputs, final_endpoint=endpoint) + self.assertTrue(out_tensor.op.name.startswith( + 'InceptionV1/' + endpoint)) + self.assertItemsEqual(endpoints[:index+1], end_points) + + def testBuildAndCheckAllEndPointsUptoMixed5c(self): + batch_size = 5 + height, width = 224, 224 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v1_base(inputs, + final_endpoint='Mixed_5c') + endpoints_shapes = {'Conv2d_1a_7x7': [5, 112, 112, 64], + 'MaxPool_2a_3x3': [5, 56, 56, 64], + 'Conv2d_2b_1x1': [5, 56, 56, 64], + 'Conv2d_2c_3x3': [5, 56, 56, 192], + 'MaxPool_3a_3x3': [5, 28, 28, 192], + 'Mixed_3b': [5, 28, 28, 256], + 'Mixed_3c': [5, 28, 28, 480], + 'MaxPool_4a_3x3': [5, 14, 14, 480], + 'Mixed_4b': [5, 14, 14, 512], + 'Mixed_4c': [5, 14, 14, 512], + 'Mixed_4d': [5, 14, 14, 512], + 'Mixed_4e': [5, 14, 14, 528], + 'Mixed_4f': [5, 14, 14, 832], + 'MaxPool_5a_2x2': [5, 7, 7, 832], + 'Mixed_5b': [5, 7, 7, 832], + 'Mixed_5c': [5, 7, 7, 1024]} + + self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + expected_shape = endpoints_shapes[endpoint_name] + self.assertTrue(endpoint_name in end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testModelHasExpectedNumberOfParameters(self): + batch_size = 5 + height, width = 224, 224 + inputs = tf.random_uniform((batch_size, height, width, 3)) + with slim.arg_scope(inception.inception_v1_arg_scope()): + inception.inception_v1_base(inputs) + total_params, _ = slim.model_analyzer.analyze_vars( + slim.get_model_variables()) + self.assertAlmostEqual(5607184, total_params) + + def testHalfSizeImages(self): + batch_size = 5 + height, width = 112, 112 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + mixed_5c, _ = inception.inception_v1_base(inputs) + self.assertTrue(mixed_5c.op.name.startswith('InceptionV1/Mixed_5c')) + self.assertListEqual(mixed_5c.get_shape().as_list(), + [batch_size, 4, 4, 1024]) + + def testUnknownImageShape(self): + tf.reset_default_graph() + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + input_np = np.random.uniform(0, 1, (batch_size, height, width, 3)) + with self.test_session() as sess: + inputs = tf.placeholder(tf.float32, shape=(batch_size, None, None, 3)) + logits, end_points = inception.inception_v1(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV1/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + pre_pool = end_points['Mixed_5c'] + feed_dict = {inputs: input_np} + tf.initialize_all_variables().run() + pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict) + self.assertListEqual(list(pre_pool_out.shape), [batch_size, 7, 7, 1024]) + + def testUnknowBatchSize(self): + batch_size = 1 + height, width = 224, 224 + num_classes = 1000 + + inputs = tf.placeholder(tf.float32, (None, height, width, 3)) + logits, _ = inception.inception_v1(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV1/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [None, num_classes]) + images = tf.random_uniform((batch_size, height, width, 3)) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEquals(output.shape, (batch_size, num_classes)) + + def testEvaluation(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = inception.inception_v1(eval_inputs, num_classes, + is_training=False) + predictions = tf.argmax(logits, 1) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (batch_size,)) + + def testTrainEvalWithReuse(self): + train_batch_size = 5 + eval_batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + + train_inputs = tf.random_uniform((train_batch_size, height, width, 3)) + inception.inception_v1(train_inputs, num_classes) + eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3)) + logits, _ = inception.inception_v1(eval_inputs, num_classes, reuse=True) + predictions = tf.argmax(logits, 1) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (eval_batch_size,)) + + def testLogitsNotSqueezed(self): + num_classes = 25 + images = tf.random_uniform([1, 224, 224, 3]) + logits, _ = inception.inception_v1(images, + num_classes=num_classes, + spatial_squeeze=False) + + with self.test_session() as sess: + tf.initialize_all_variables().run() + logits_out = sess.run(logits) + self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/inception_v2.py b/nets/inception_v2.py new file mode 100644 index 0000000..914c4b2 --- /dev/null +++ b/nets/inception_v2.py @@ -0,0 +1,545 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains the definition for inception v2 classification network.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim +trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) + + +def inception_v2_base(inputs, + final_endpoint='Mixed_5c', + min_depth=16, + depth_multiplier=1.0, + scope=None): + """Inception v2 (6a2). + + Constructs an Inception v2 network from inputs to the given final endpoint. + This method can construct the network up to the layer inception(5b) as + described in http://arxiv.org/abs/1502.03167. + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + final_endpoint: specifies the endpoint to construct the network up to. It + can be one of ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', + 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', 'Mixed_4a', + 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a', 'Mixed_5b', + 'Mixed_5c']. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. + depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + scope: Optional variable_scope. + + Returns: + tensor_out: output tensor corresponding to the final_endpoint. + end_points: a set of activations for external use, for example summaries or + losses. + + Raises: + ValueError: if final_endpoint is not set to one of the predefined values, + or depth_multiplier <= 0 + """ + + # end_points will collect relevant activations for external use, for example + # summaries or losses. + end_points = {} + + # Used to find thinned depths for each layer. + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + depth = lambda d: max(int(d * depth_multiplier), min_depth) + + with tf.variable_scope(scope, 'InceptionV2', [inputs]): + with slim.arg_scope( + [slim.conv2d, slim.max_pool2d, slim.avg_pool2d, slim.separable_conv2d], + stride=1, padding='SAME'): + + # Note that sizes in the comments below assume an input spatial size of + # 224x224, however, the inputs can be of any size greater 32x32. + + # 224 x 224 x 3 + end_point = 'Conv2d_1a_7x7' + # depthwise_multiplier here is different from depth_multiplier. + # depthwise_multiplier determines the output channels of the initial + # depthwise conv (see docs for tf.nn.separable_conv2d), while + # depth_multiplier controls the # channels of the subsequent 1x1 + # convolution. Must have + # in_channels * depthwise_multipler <= out_channels + # so that the separable convolution is not overparameterized. + depthwise_multiplier = min(int(depth(64) / 3), 8) + net = slim.separable_conv2d( + inputs, depth(64), [7, 7], depth_multiplier=depthwise_multiplier, + stride=2, weights_initializer=trunc_normal(1.0), + scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 112 x 112 x 64 + end_point = 'MaxPool_2a_3x3' + net = slim.max_pool2d(net, [3, 3], scope=end_point, stride=2) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 56 x 56 x 64 + end_point = 'Conv2d_2b_1x1' + net = slim.conv2d(net, depth(64), [1, 1], scope=end_point, + weights_initializer=trunc_normal(0.1)) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 56 x 56 x 64 + end_point = 'Conv2d_2c_3x3' + net = slim.conv2d(net, depth(192), [3, 3], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 56 x 56 x 192 + end_point = 'MaxPool_3a_3x3' + net = slim.max_pool2d(net, [3, 3], scope=end_point, stride=2) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 28 x 28 x 192 + # Inception module. + end_point = 'Mixed_3b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(64), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(64), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(32), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 28 x 28 x 256 + end_point = 'Mixed_3c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(64), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(64), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(64), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 28 x 28 x 320 + end_point = 'Mixed_4a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d( + net, depth(128), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_0 = slim.conv2d(branch_0, depth(160), [3, 3], stride=2, + scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(64), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d( + branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3') + branch_1 = slim.conv2d( + branch_1, depth(96), [3, 3], stride=2, scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d( + net, [3, 3], stride=2, scope='MaxPool_1a_3x3') + net = tf.concat(3, [branch_0, branch_1, branch_2]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 14 x 14 x 576 + end_point = 'Mixed_4b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(224), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(64), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d( + branch_1, depth(96), [3, 3], scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(96), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(128), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 14 x 14 x 576 + end_point = 'Mixed_4c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(96), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(128), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(96), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(128), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(128), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 14 x 14 x 576 + end_point = 'Mixed_4d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(128), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(160), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(128), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(160), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(160), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(96), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # 14 x 14 x 576 + end_point = 'Mixed_4e' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(96), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(128), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(160), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(192), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(192), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(96), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 14 x 14 x 576 + end_point = 'Mixed_5a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d( + net, depth(128), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_0 = slim.conv2d(branch_0, depth(192), [3, 3], stride=2, + scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], + scope='Conv2d_0b_3x3') + branch_1 = slim.conv2d(branch_1, depth(256), [3, 3], stride=2, + scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, + scope='MaxPool_1a_3x3') + net = tf.concat(3, [branch_0, branch_1, branch_2]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 7 x 7 x 1024 + end_point = 'Mixed_5b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(352), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(320), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(160), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(128), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # 7 x 7 x 1024 + end_point = 'Mixed_5c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(352), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(320), [3, 3], + scope='Conv2d_0b_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d( + net, depth(192), [1, 1], + weights_initializer=trunc_normal(0.09), + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(224), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.max_pool2d(net, [3, 3], scope='MaxPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(128), [1, 1], + weights_initializer=trunc_normal(0.1), + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + +def inception_v2(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.8, + min_depth=16, + depth_multiplier=1.0, + prediction_fn=slim.softmax, + spatial_squeeze=True, + reuse=None, + scope='InceptionV2'): + """Inception v2 model for classification. + + Constructs an Inception v2 network for classification as described in + http://arxiv.org/abs/1502.03167. + + The default image size used to train this network is 224x224. + + Args: + inputs: a tensor of shape [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether is training or not. + dropout_keep_prob: the percentage of activation values that are retained. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. + depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + prediction_fn: a function to get predictions out of logits. + spatial_squeeze: if True, logits is of shape is [B, C], if false logits is + of shape [B, 1, 1, C], where B is batch_size and C is number of classes. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, num_classes] + end_points: a dictionary from components of the network to the corresponding + activation. + + Raises: + ValueError: if final_endpoint is not set to one of the predefined values, + or depth_multiplier <= 0 + """ + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + + # Final pooling and prediction + with tf.variable_scope(scope, 'InceptionV2', [inputs, num_classes], + reuse=reuse) as scope: + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + net, end_points = inception_v2_base( + inputs, scope=scope, min_depth=min_depth, + depth_multiplier=depth_multiplier) + with tf.variable_scope('Logits'): + kernel_size = _reduced_kernel_size_for_small_input(net, [7, 7]) + net = slim.avg_pool2d(net, kernel_size, padding='VALID', + scope='AvgPool_1a_{}x{}'.format(*kernel_size)) + # 1 x 1 x 1024 + net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b') + logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, scope='Conv2d_1c_1x1') + if spatial_squeeze: + logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') + end_points['Logits'] = logits + end_points['Predictions'] = prediction_fn(logits, scope='Predictions') + return logits, end_points +inception_v2.default_image_size = 224 + + +def _reduced_kernel_size_for_small_input(input_tensor, kernel_size): + """Define kernel size which is automatically reduced for small input. + + If the shape of the input images is unknown at graph construction time this + function assumes that the input images are is large enough. + + Args: + input_tensor: input tensor of size [batch_size, height, width, channels]. + kernel_size: desired kernel size of length 2: [kernel_height, kernel_width] + + Returns: + a tensor with the kernel size. + + TODO(jrru): Make this function work with unknown shapes. Theoretically, this + can be done with the code below. Problems are two-fold: (1) If the shape was + known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot + handle tensors that define the kernel size. + shape = tf.shape(input_tensor) + return = tf.pack([tf.minimum(shape[1], kernel_size[0]), + tf.minimum(shape[2], kernel_size[1])]) + + """ + shape = input_tensor.get_shape().as_list() + if shape[1] is None or shape[2] is None: + kernel_size_out = kernel_size + else: + kernel_size_out = [min(shape[1], kernel_size[0]), + min(shape[2], kernel_size[1])] + return kernel_size_out + + +def inception_v2_arg_scope(weight_decay=0.00004): + """Defines the default InceptionV2 arg scope. + + Args: + weight_decay: The weight decay to use for regularizing the model. + + Returns: + An `arg_scope` to use for the inception v3 model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': 0.9997, + # epsilon to prevent 0s in variance. + 'epsilon': 0.001, + # collection containing update_ops. + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + + # Set weight_decay for weights in Conv and FC layers. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay)): + with slim.arg_scope( + [slim.conv2d], + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params) as sc: + return sc diff --git a/nets/inception_v2_test.py b/nets/inception_v2_test.py new file mode 100644 index 0000000..0ff850c --- /dev/null +++ b/nets/inception_v2_test.py @@ -0,0 +1,262 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for nets.inception_v2.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from nets import inception + +slim = tf.contrib.slim + + +class InceptionV2Test(tf.test.TestCase): + + def testBuildClassificationNetwork(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, end_points = inception.inception_v2(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV2/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertTrue('Predictions' in end_points) + self.assertListEqual(end_points['Predictions'].get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildBaseNetwork(self): + batch_size = 5 + height, width = 224, 224 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + mixed_5c, end_points = inception.inception_v2_base(inputs) + self.assertTrue(mixed_5c.op.name.startswith('InceptionV2/Mixed_5c')) + self.assertListEqual(mixed_5c.get_shape().as_list(), + [batch_size, 7, 7, 1024]) + expected_endpoints = ['Mixed_3b', 'Mixed_3c', 'Mixed_4a', 'Mixed_4b', + 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', 'Mixed_5a', + 'Mixed_5b', 'Mixed_5c', 'Conv2d_1a_7x7', + 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', 'Conv2d_2c_3x3', + 'MaxPool_3a_3x3'] + self.assertItemsEqual(end_points.keys(), expected_endpoints) + + def testBuildOnlyUptoFinalEndpoint(self): + batch_size = 5 + height, width = 224, 224 + endpoints = ['Conv2d_1a_7x7', 'MaxPool_2a_3x3', 'Conv2d_2b_1x1', + 'Conv2d_2c_3x3', 'MaxPool_3a_3x3', 'Mixed_3b', 'Mixed_3c', + 'Mixed_4a', 'Mixed_4b', 'Mixed_4c', 'Mixed_4d', 'Mixed_4e', + 'Mixed_5a', 'Mixed_5b', 'Mixed_5c'] + for index, endpoint in enumerate(endpoints): + with tf.Graph().as_default(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + out_tensor, end_points = inception.inception_v2_base( + inputs, final_endpoint=endpoint) + self.assertTrue(out_tensor.op.name.startswith( + 'InceptionV2/' + endpoint)) + self.assertItemsEqual(endpoints[:index+1], end_points) + + def testBuildAndCheckAllEndPointsUptoMixed5c(self): + batch_size = 5 + height, width = 224, 224 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v2_base(inputs, + final_endpoint='Mixed_5c') + endpoints_shapes = {'Mixed_3b': [batch_size, 28, 28, 256], + 'Mixed_3c': [batch_size, 28, 28, 320], + 'Mixed_4a': [batch_size, 14, 14, 576], + 'Mixed_4b': [batch_size, 14, 14, 576], + 'Mixed_4c': [batch_size, 14, 14, 576], + 'Mixed_4d': [batch_size, 14, 14, 576], + 'Mixed_4e': [batch_size, 14, 14, 576], + 'Mixed_5a': [batch_size, 7, 7, 1024], + 'Mixed_5b': [batch_size, 7, 7, 1024], + 'Mixed_5c': [batch_size, 7, 7, 1024], + 'Conv2d_1a_7x7': [batch_size, 112, 112, 64], + 'MaxPool_2a_3x3': [batch_size, 56, 56, 64], + 'Conv2d_2b_1x1': [batch_size, 56, 56, 64], + 'Conv2d_2c_3x3': [batch_size, 56, 56, 192], + 'MaxPool_3a_3x3': [batch_size, 28, 28, 192]} + self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + expected_shape = endpoints_shapes[endpoint_name] + self.assertTrue(endpoint_name in end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testModelHasExpectedNumberOfParameters(self): + batch_size = 5 + height, width = 224, 224 + inputs = tf.random_uniform((batch_size, height, width, 3)) + with slim.arg_scope(inception.inception_v2_arg_scope()): + inception.inception_v2_base(inputs) + total_params, _ = slim.model_analyzer.analyze_vars( + slim.get_model_variables()) + self.assertAlmostEqual(10173112, total_params) + + def testBuildEndPointsWithDepthMultiplierLessThanOne(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v2(inputs, num_classes) + + endpoint_keys = [key for key in end_points.keys() + if key.startswith('Mixed') or key.startswith('Conv')] + + _, end_points_with_multiplier = inception.inception_v2( + inputs, num_classes, scope='depth_multiplied_net', + depth_multiplier=0.5) + + for key in endpoint_keys: + original_depth = end_points[key].get_shape().as_list()[3] + new_depth = end_points_with_multiplier[key].get_shape().as_list()[3] + self.assertEqual(0.5 * original_depth, new_depth) + + def testBuildEndPointsWithDepthMultiplierGreaterThanOne(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v2(inputs, num_classes) + + endpoint_keys = [key for key in end_points.keys() + if key.startswith('Mixed') or key.startswith('Conv')] + + _, end_points_with_multiplier = inception.inception_v2( + inputs, num_classes, scope='depth_multiplied_net', + depth_multiplier=2.0) + + for key in endpoint_keys: + original_depth = end_points[key].get_shape().as_list()[3] + new_depth = end_points_with_multiplier[key].get_shape().as_list()[3] + self.assertEqual(2.0 * original_depth, new_depth) + + def testRaiseValueErrorWithInvalidDepthMultiplier(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + with self.assertRaises(ValueError): + _ = inception.inception_v2(inputs, num_classes, depth_multiplier=-0.1) + with self.assertRaises(ValueError): + _ = inception.inception_v2(inputs, num_classes, depth_multiplier=0.0) + + def testHalfSizeImages(self): + batch_size = 5 + height, width = 112, 112 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, end_points = inception.inception_v2(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV2/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + pre_pool = end_points['Mixed_5c'] + self.assertListEqual(pre_pool.get_shape().as_list(), + [batch_size, 4, 4, 1024]) + + def testUnknownImageShape(self): + tf.reset_default_graph() + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + input_np = np.random.uniform(0, 1, (batch_size, height, width, 3)) + with self.test_session() as sess: + inputs = tf.placeholder(tf.float32, shape=(batch_size, None, None, 3)) + logits, end_points = inception.inception_v2(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV2/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + pre_pool = end_points['Mixed_5c'] + feed_dict = {inputs: input_np} + tf.initialize_all_variables().run() + pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict) + self.assertListEqual(list(pre_pool_out.shape), [batch_size, 7, 7, 1024]) + + def testUnknowBatchSize(self): + batch_size = 1 + height, width = 224, 224 + num_classes = 1000 + + inputs = tf.placeholder(tf.float32, (None, height, width, 3)) + logits, _ = inception.inception_v2(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV2/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [None, num_classes]) + images = tf.random_uniform((batch_size, height, width, 3)) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEquals(output.shape, (batch_size, num_classes)) + + def testEvaluation(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = inception.inception_v2(eval_inputs, num_classes, + is_training=False) + predictions = tf.argmax(logits, 1) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (batch_size,)) + + def testTrainEvalWithReuse(self): + train_batch_size = 5 + eval_batch_size = 2 + height, width = 150, 150 + num_classes = 1000 + + train_inputs = tf.random_uniform((train_batch_size, height, width, 3)) + inception.inception_v2(train_inputs, num_classes) + eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3)) + logits, _ = inception.inception_v2(eval_inputs, num_classes, reuse=True) + predictions = tf.argmax(logits, 1) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (eval_batch_size,)) + + def testLogitsNotSqueezed(self): + num_classes = 25 + images = tf.random_uniform([1, 224, 224, 3]) + logits, _ = inception.inception_v2(images, + num_classes=num_classes, + spatial_squeeze=False) + + with self.test_session() as sess: + tf.initialize_all_variables().run() + logits_out = sess.run(logits) + self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/inception_v3.py b/nets/inception_v3.py new file mode 100644 index 0000000..d5a1fe3 --- /dev/null +++ b/nets/inception_v3.py @@ -0,0 +1,587 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains the definition for inception v3 classification network.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim +trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) + + +def inception_v3_base(inputs, + final_endpoint='Mixed_7c', + min_depth=16, + depth_multiplier=1.0, + scope=None): + """Inception model from http://arxiv.org/abs/1512.00567. + + Constructs an Inception v3 network from inputs to the given final endpoint. + This method can construct the network up to the final inception block + Mixed_7c. + + Note that the names of the layers in the paper do not correspond to the names + of the endpoints registered by this function although they build the same + network. + + Here is a mapping from the old_names to the new names: + Old name | New name + ======================================= + conv0 | Conv2d_1a_3x3 + conv1 | Conv2d_2a_3x3 + conv2 | Conv2d_2b_3x3 + pool1 | MaxPool_3a_3x3 + conv3 | Conv2d_3b_1x1 + conv4 | Conv2d_4a_3x3 + pool2 | MaxPool_5a_3x3 + mixed_35x35x256a | Mixed_5b + mixed_35x35x288a | Mixed_5c + mixed_35x35x288b | Mixed_5d + mixed_17x17x768a | Mixed_6a + mixed_17x17x768b | Mixed_6b + mixed_17x17x768c | Mixed_6c + mixed_17x17x768d | Mixed_6d + mixed_17x17x768e | Mixed_6e + mixed_8x8x1280a | Mixed_7a + mixed_8x8x2048a | Mixed_7b + mixed_8x8x2048b | Mixed_7c + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + final_endpoint: specifies the endpoint to construct the network up to. It + can be one of ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', + 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', 'MaxPool_5a_3x3', + 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', + 'Mixed_6d', 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c']. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. + depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + scope: Optional variable_scope. + + Returns: + tensor_out: output tensor corresponding to the final_endpoint. + end_points: a set of activations for external use, for example summaries or + losses. + + Raises: + ValueError: if final_endpoint is not set to one of the predefined values, + or depth_multiplier <= 0 + """ + # end_points will collect relevant activations for external use, for example + # summaries or losses. + end_points = {} + + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + depth = lambda d: max(int(d * depth_multiplier), min_depth) + + with tf.variable_scope(scope, 'InceptionV3', [inputs]): + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='VALID'): + # 299 x 299 x 3 + end_point = 'Conv2d_1a_3x3' + net = slim.conv2d(inputs, depth(32), [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 149 x 149 x 32 + end_point = 'Conv2d_2a_3x3' + net = slim.conv2d(net, depth(32), [3, 3], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 147 x 147 x 32 + end_point = 'Conv2d_2b_3x3' + net = slim.conv2d(net, depth(64), [3, 3], padding='SAME', scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 147 x 147 x 64 + end_point = 'MaxPool_3a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 73 x 73 x 64 + end_point = 'Conv2d_3b_1x1' + net = slim.conv2d(net, depth(80), [1, 1], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 73 x 73 x 80. + end_point = 'Conv2d_4a_3x3' + net = slim.conv2d(net, depth(192), [3, 3], scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 71 x 71 x 192. + end_point = 'MaxPool_5a_3x3' + net = slim.max_pool2d(net, [3, 3], stride=2, scope=end_point) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # 35 x 35 x 192. + + # Inception blocks + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + # mixed: 35 x 35 x 256. + end_point = 'Mixed_5b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv2d_0b_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(32), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_1: 35 x 35 x 288. + end_point = 'Mixed_5c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0b_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv_1_0c_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], + scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_2: 35 x 35 x 288. + end_point = 'Mixed_5d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(48), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(64), [5, 5], + scope='Conv2d_0b_5x5') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_2 = slim.conv2d(branch_2, depth(96), [3, 3], + scope='Conv2d_0c_3x3') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(64), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_3: 17 x 17 x 768. + end_point = 'Mixed_6a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(384), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(64), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], + scope='Conv2d_0b_3x3') + branch_1 = slim.conv2d(branch_1, depth(96), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_1x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat(3, [branch_0, branch_1, branch_2]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed4: 17 x 17 x 768. + end_point = 'Mixed_6b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(128), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(128), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(128), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(128), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_5: 17 x 17 x 768. + end_point = 'Mixed_6c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # mixed_6: 17 x 17 x 768. + end_point = 'Mixed_6d' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(160), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(160), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(160), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(160), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_7: 17 x 17 x 768. + end_point = 'Mixed_6e' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], + scope='Conv2d_0b_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0c_1x7') + branch_2 = slim.conv2d(branch_2, depth(192), [7, 1], + scope='Conv2d_0d_7x1') + branch_2 = slim.conv2d(branch_2, depth(192), [1, 7], + scope='Conv2d_0e_1x7') + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d(branch_3, depth(192), [1, 1], + scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_8: 8 x 8 x 1280. + end_point = 'Mixed_7a' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_0 = slim.conv2d(branch_0, depth(320), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(192), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = slim.conv2d(branch_1, depth(192), [1, 7], + scope='Conv2d_0b_1x7') + branch_1 = slim.conv2d(branch_1, depth(192), [7, 1], + scope='Conv2d_0c_7x1') + branch_1 = slim.conv2d(branch_1, depth(192), [3, 3], stride=2, + padding='VALID', scope='Conv2d_1a_3x3') + with tf.variable_scope('Branch_2'): + branch_2 = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', + scope='MaxPool_1a_3x3') + net = tf.concat(3, [branch_0, branch_1, branch_2]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + # mixed_9: 8 x 8 x 2048. + end_point = 'Mixed_7b' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = tf.concat(3, [ + slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'), + slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0b_3x1')]) + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d( + branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3') + branch_2 = tf.concat(3, [ + slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'), + slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')]) + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + + # mixed_10: 8 x 8 x 2048. + end_point = 'Mixed_7c' + with tf.variable_scope(end_point): + with tf.variable_scope('Branch_0'): + branch_0 = slim.conv2d(net, depth(320), [1, 1], scope='Conv2d_0a_1x1') + with tf.variable_scope('Branch_1'): + branch_1 = slim.conv2d(net, depth(384), [1, 1], scope='Conv2d_0a_1x1') + branch_1 = tf.concat(3, [ + slim.conv2d(branch_1, depth(384), [1, 3], scope='Conv2d_0b_1x3'), + slim.conv2d(branch_1, depth(384), [3, 1], scope='Conv2d_0c_3x1')]) + with tf.variable_scope('Branch_2'): + branch_2 = slim.conv2d(net, depth(448), [1, 1], scope='Conv2d_0a_1x1') + branch_2 = slim.conv2d( + branch_2, depth(384), [3, 3], scope='Conv2d_0b_3x3') + branch_2 = tf.concat(3, [ + slim.conv2d(branch_2, depth(384), [1, 3], scope='Conv2d_0c_1x3'), + slim.conv2d(branch_2, depth(384), [3, 1], scope='Conv2d_0d_3x1')]) + with tf.variable_scope('Branch_3'): + branch_3 = slim.avg_pool2d(net, [3, 3], scope='AvgPool_0a_3x3') + branch_3 = slim.conv2d( + branch_3, depth(192), [1, 1], scope='Conv2d_0b_1x1') + net = tf.concat(3, [branch_0, branch_1, branch_2, branch_3]) + end_points[end_point] = net + if end_point == final_endpoint: return net, end_points + raise ValueError('Unknown final endpoint %s' % final_endpoint) + + +def inception_v3(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.8, + min_depth=16, + depth_multiplier=1.0, + prediction_fn=slim.softmax, + spatial_squeeze=True, + reuse=None, + scope='InceptionV3'): + """Inception model from http://arxiv.org/abs/1512.00567. + + "Rethinking the Inception Architecture for Computer Vision" + + Christian Szegedy, Vincent Vanhoucke, Sergey Ioffe, Jonathon Shlens, + Zbigniew Wojna. + + With the default arguments this method constructs the exact model defined in + the paper. However, one can experiment with variations of the inception_v3 + network by changing arguments dropout_keep_prob, min_depth and + depth_multiplier. + + The default image size used to train this network is 299x299. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether is training or not. + dropout_keep_prob: the percentage of activation values that are retained. + min_depth: Minimum depth value (number of channels) for all convolution ops. + Enforced when depth_multiplier < 1, and not an active constraint when + depth_multiplier >= 1. + depth_multiplier: Float multiplier for the depth (number of channels) + for all convolution ops. The value must be greater than zero. Typical + usage will be to set this value in (0, 1) to reduce the number of + parameters or computation cost of the model. + prediction_fn: a function to get predictions out of logits. + spatial_squeeze: if True, logits is of shape is [B, C], if false logits is + of shape [B, 1, 1, C], where B is batch_size and C is number of classes. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, num_classes] + end_points: a dictionary from components of the network to the corresponding + activation. + + Raises: + ValueError: if 'depth_multiplier' is less than or equal to zero. + """ + if depth_multiplier <= 0: + raise ValueError('depth_multiplier is not greater than zero.') + depth = lambda d: max(int(d * depth_multiplier), min_depth) + + with tf.variable_scope(scope, 'InceptionV3', [inputs, num_classes], + reuse=reuse) as scope: + with slim.arg_scope([slim.batch_norm, slim.dropout], + is_training=is_training): + net, end_points = inception_v3_base( + inputs, scope=scope, min_depth=min_depth, + depth_multiplier=depth_multiplier) + + # Auxiliary Head logits + with slim.arg_scope([slim.conv2d, slim.max_pool2d, slim.avg_pool2d], + stride=1, padding='SAME'): + aux_logits = end_points['Mixed_6e'] + with tf.variable_scope('AuxLogits'): + aux_logits = slim.avg_pool2d( + aux_logits, [5, 5], stride=3, padding='VALID', + scope='AvgPool_1a_5x5') + aux_logits = slim.conv2d(aux_logits, depth(128), [1, 1], + scope='Conv2d_1b_1x1') + + # Shape of feature map before the final layer. + kernel_size = _reduced_kernel_size_for_small_input( + aux_logits, [5, 5]) + aux_logits = slim.conv2d( + aux_logits, depth(768), kernel_size, + weights_initializer=trunc_normal(0.01), + padding='VALID', scope='Conv2d_2a_{}x{}'.format(*kernel_size)) + aux_logits = slim.conv2d( + aux_logits, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, weights_initializer=trunc_normal(0.001), + scope='Conv2d_2b_1x1') + if spatial_squeeze: + aux_logits = tf.squeeze(aux_logits, [1, 2], name='SpatialSqueeze') + end_points['AuxLogits'] = aux_logits + + # Final pooling and prediction + with tf.variable_scope('Logits'): + kernel_size = _reduced_kernel_size_for_small_input(net, [8, 8]) + net = slim.avg_pool2d(net, kernel_size, padding='VALID', + scope='AvgPool_1a_{}x{}'.format(*kernel_size)) + # 1 x 1 x 2048 + net = slim.dropout(net, keep_prob=dropout_keep_prob, scope='Dropout_1b') + end_points['PreLogits'] = net + # 2048 + logits = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, scope='Conv2d_1c_1x1') + if spatial_squeeze: + logits = tf.squeeze(logits, [1, 2], name='SpatialSqueeze') + # 1000 + end_points['Logits'] = logits + end_points['Predictions'] = prediction_fn(logits, scope='Predictions') + return logits, end_points +inception_v3.default_image_size = 299 + + +def _reduced_kernel_size_for_small_input(input_tensor, kernel_size): + """Define kernel size which is automatically reduced for small input. + + If the shape of the input images is unknown at graph construction time this + function assumes that the input images are is large enough. + + Args: + input_tensor: input tensor of size [batch_size, height, width, channels]. + kernel_size: desired kernel size of length 2: [kernel_height, kernel_width] + + Returns: + a tensor with the kernel size. + + TODO(jrru): Make this function work with unknown shapes. Theoretically, this + can be done with the code below. Problems are two-fold: (1) If the shape was + known, it will be lost. (2) inception.slim.ops._two_element_tuple cannot + handle tensors that define the kernel size. + shape = tf.shape(input_tensor) + return = tf.pack([tf.minimum(shape[1], kernel_size[0]), + tf.minimum(shape[2], kernel_size[1])]) + + """ + shape = input_tensor.get_shape().as_list() + if shape[1] is None or shape[2] is None: + kernel_size_out = kernel_size + else: + kernel_size_out = [min(shape[1], kernel_size[0]), + min(shape[2], kernel_size[1])] + return kernel_size_out + + +def inception_v3_arg_scope(weight_decay=0.00004, + stddev=0.1): + """Defines the default InceptionV3 arg scope. + + Args: + weight_decay: The weight decay to use for regularizing the model. + stddev: The standard deviation of the trunctated normal weight initializer. + + Returns: + An `arg_scope` to use for the inception v3 model. + """ + batch_norm_params = { + # Decay for the moving averages. + 'decay': 0.9997, + # epsilon to prevent 0s in variance. + 'epsilon': 0.001, + # collection containing update_ops. + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + + # Set weight_decay for weights in Conv and FC layers. + with slim.arg_scope([slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay)): + with slim.arg_scope( + [slim.conv2d], + weights_initializer=tf.truncated_normal_initializer(stddev=stddev), + activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params) as sc: + return sc diff --git a/nets/inception_v3_test.py b/nets/inception_v3_test.py new file mode 100644 index 0000000..c82b265 --- /dev/null +++ b/nets/inception_v3_test.py @@ -0,0 +1,292 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for nets.inception_v1.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from nets import inception + +slim = tf.contrib.slim + + +class InceptionV3Test(tf.test.TestCase): + + def testBuildClassificationNetwork(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, end_points = inception.inception_v3(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV3/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertTrue('Predictions' in end_points) + self.assertListEqual(end_points['Predictions'].get_shape().as_list(), + [batch_size, num_classes]) + + def testBuildBaseNetwork(self): + batch_size = 5 + height, width = 299, 299 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + final_endpoint, end_points = inception.inception_v3_base(inputs) + self.assertTrue(final_endpoint.op.name.startswith( + 'InceptionV3/Mixed_7c')) + self.assertListEqual(final_endpoint.get_shape().as_list(), + [batch_size, 8, 8, 2048]) + expected_endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', + 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', + 'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', + 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', + 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'] + self.assertItemsEqual(end_points.keys(), expected_endpoints) + + def testBuildOnlyUptoFinalEndpoint(self): + batch_size = 5 + height, width = 299, 299 + endpoints = ['Conv2d_1a_3x3', 'Conv2d_2a_3x3', 'Conv2d_2b_3x3', + 'MaxPool_3a_3x3', 'Conv2d_3b_1x1', 'Conv2d_4a_3x3', + 'MaxPool_5a_3x3', 'Mixed_5b', 'Mixed_5c', 'Mixed_5d', + 'Mixed_6a', 'Mixed_6b', 'Mixed_6c', 'Mixed_6d', + 'Mixed_6e', 'Mixed_7a', 'Mixed_7b', 'Mixed_7c'] + + for index, endpoint in enumerate(endpoints): + with tf.Graph().as_default(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + out_tensor, end_points = inception.inception_v3_base( + inputs, final_endpoint=endpoint) + self.assertTrue(out_tensor.op.name.startswith( + 'InceptionV3/' + endpoint)) + self.assertItemsEqual(endpoints[:index+1], end_points) + + def testBuildAndCheckAllEndPointsUptoMixed7c(self): + batch_size = 5 + height, width = 299, 299 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v3_base( + inputs, final_endpoint='Mixed_7c') + endpoints_shapes = {'Conv2d_1a_3x3': [batch_size, 149, 149, 32], + 'Conv2d_2a_3x3': [batch_size, 147, 147, 32], + 'Conv2d_2b_3x3': [batch_size, 147, 147, 64], + 'MaxPool_3a_3x3': [batch_size, 73, 73, 64], + 'Conv2d_3b_1x1': [batch_size, 73, 73, 80], + 'Conv2d_4a_3x3': [batch_size, 71, 71, 192], + 'MaxPool_5a_3x3': [batch_size, 35, 35, 192], + 'Mixed_5b': [batch_size, 35, 35, 256], + 'Mixed_5c': [batch_size, 35, 35, 288], + 'Mixed_5d': [batch_size, 35, 35, 288], + 'Mixed_6a': [batch_size, 17, 17, 768], + 'Mixed_6b': [batch_size, 17, 17, 768], + 'Mixed_6c': [batch_size, 17, 17, 768], + 'Mixed_6d': [batch_size, 17, 17, 768], + 'Mixed_6e': [batch_size, 17, 17, 768], + 'Mixed_7a': [batch_size, 8, 8, 1280], + 'Mixed_7b': [batch_size, 8, 8, 2048], + 'Mixed_7c': [batch_size, 8, 8, 2048]} + self.assertItemsEqual(endpoints_shapes.keys(), end_points.keys()) + for endpoint_name in endpoints_shapes: + expected_shape = endpoints_shapes[endpoint_name] + self.assertTrue(endpoint_name in end_points) + self.assertListEqual(end_points[endpoint_name].get_shape().as_list(), + expected_shape) + + def testModelHasExpectedNumberOfParameters(self): + batch_size = 5 + height, width = 299, 299 + inputs = tf.random_uniform((batch_size, height, width, 3)) + with slim.arg_scope(inception.inception_v3_arg_scope()): + inception.inception_v3_base(inputs) + total_params, _ = slim.model_analyzer.analyze_vars( + slim.get_model_variables()) + self.assertAlmostEqual(21802784, total_params) + + def testBuildEndPoints(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v3(inputs, num_classes) + self.assertTrue('Logits' in end_points) + logits = end_points['Logits'] + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertTrue('AuxLogits' in end_points) + aux_logits = end_points['AuxLogits'] + self.assertListEqual(aux_logits.get_shape().as_list(), + [batch_size, num_classes]) + self.assertTrue('Mixed_7c' in end_points) + pre_pool = end_points['Mixed_7c'] + self.assertListEqual(pre_pool.get_shape().as_list(), + [batch_size, 8, 8, 2048]) + self.assertTrue('PreLogits' in end_points) + pre_logits = end_points['PreLogits'] + self.assertListEqual(pre_logits.get_shape().as_list(), + [batch_size, 1, 1, 2048]) + + def testBuildEndPointsWithDepthMultiplierLessThanOne(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v3(inputs, num_classes) + + endpoint_keys = [key for key in end_points.keys() + if key.startswith('Mixed') or key.startswith('Conv')] + + _, end_points_with_multiplier = inception.inception_v3( + inputs, num_classes, scope='depth_multiplied_net', + depth_multiplier=0.5) + + for key in endpoint_keys: + original_depth = end_points[key].get_shape().as_list()[3] + new_depth = end_points_with_multiplier[key].get_shape().as_list()[3] + self.assertEqual(0.5 * original_depth, new_depth) + + def testBuildEndPointsWithDepthMultiplierGreaterThanOne(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = inception.inception_v3(inputs, num_classes) + + endpoint_keys = [key for key in end_points.keys() + if key.startswith('Mixed') or key.startswith('Conv')] + + _, end_points_with_multiplier = inception.inception_v3( + inputs, num_classes, scope='depth_multiplied_net', + depth_multiplier=2.0) + + for key in endpoint_keys: + original_depth = end_points[key].get_shape().as_list()[3] + new_depth = end_points_with_multiplier[key].get_shape().as_list()[3] + self.assertEqual(2.0 * original_depth, new_depth) + + def testRaiseValueErrorWithInvalidDepthMultiplier(self): + batch_size = 5 + height, width = 299, 299 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + with self.assertRaises(ValueError): + _ = inception.inception_v3(inputs, num_classes, depth_multiplier=-0.1) + with self.assertRaises(ValueError): + _ = inception.inception_v3(inputs, num_classes, depth_multiplier=0.0) + + def testHalfSizeImages(self): + batch_size = 5 + height, width = 150, 150 + num_classes = 1000 + + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, end_points = inception.inception_v3(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV3/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + pre_pool = end_points['Mixed_7c'] + self.assertListEqual(pre_pool.get_shape().as_list(), + [batch_size, 3, 3, 2048]) + + def testUnknownImageShape(self): + tf.reset_default_graph() + batch_size = 2 + height, width = 299, 299 + num_classes = 1000 + input_np = np.random.uniform(0, 1, (batch_size, height, width, 3)) + with self.test_session() as sess: + inputs = tf.placeholder(tf.float32, shape=(batch_size, None, None, 3)) + logits, end_points = inception.inception_v3(inputs, num_classes) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + pre_pool = end_points['Mixed_7c'] + feed_dict = {inputs: input_np} + tf.initialize_all_variables().run() + pre_pool_out = sess.run(pre_pool, feed_dict=feed_dict) + self.assertListEqual(list(pre_pool_out.shape), [batch_size, 8, 8, 2048]) + + def testUnknowBatchSize(self): + batch_size = 1 + height, width = 299, 299 + num_classes = 1000 + + inputs = tf.placeholder(tf.float32, (None, height, width, 3)) + logits, _ = inception.inception_v3(inputs, num_classes) + self.assertTrue(logits.op.name.startswith('InceptionV3/Logits')) + self.assertListEqual(logits.get_shape().as_list(), + [None, num_classes]) + images = tf.random_uniform((batch_size, height, width, 3)) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEquals(output.shape, (batch_size, num_classes)) + + def testEvaluation(self): + batch_size = 2 + height, width = 299, 299 + num_classes = 1000 + + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = inception.inception_v3(eval_inputs, num_classes, + is_training=False) + predictions = tf.argmax(logits, 1) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (batch_size,)) + + def testTrainEvalWithReuse(self): + train_batch_size = 5 + eval_batch_size = 2 + height, width = 150, 150 + num_classes = 1000 + + train_inputs = tf.random_uniform((train_batch_size, height, width, 3)) + inception.inception_v3(train_inputs, num_classes) + eval_inputs = tf.random_uniform((eval_batch_size, height, width, 3)) + logits, _ = inception.inception_v3(eval_inputs, num_classes, + is_training=False, reuse=True) + predictions = tf.argmax(logits, 1) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(predictions) + self.assertEquals(output.shape, (eval_batch_size,)) + + def testLogitsNotSqueezed(self): + num_classes = 25 + images = tf.random_uniform([1, 299, 299, 3]) + logits, _ = inception.inception_v3(images, + num_classes=num_classes, + spatial_squeeze=False) + + with self.test_session() as sess: + tf.initialize_all_variables().run() + logits_out = sess.run(logits) + self.assertListEqual(list(logits_out.shape), [1, 1, 1, num_classes]) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/lenet.py b/nets/lenet.py new file mode 100644 index 0000000..789d2bd --- /dev/null +++ b/nets/lenet.py @@ -0,0 +1,93 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains a variant of the LeNet model definition.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim + + +def lenet(images, num_classes=10, is_training=False, + dropout_keep_prob=0.5, + prediction_fn=slim.softmax, + scope='LeNet'): + """Creates a variant of the LeNet model. + + Note that since the output is a set of 'logits', the values fall in the + interval of (-infinity, infinity). Consequently, to convert the outputs to a + probability distribution over the characters, one will need to convert them + using the softmax function: + + logits = lenet.lenet(images, is_training=False) + probabilities = tf.nn.softmax(logits) + predictions = tf.argmax(logits, 1) + + Args: + images: A batch of `Tensors` of size [batch_size, height, width, channels]. + num_classes: the number of classes in the dataset. + is_training: specifies whether or not we're currently training the model. + This variable will determine the behaviour of the dropout layer. + dropout_keep_prob: the percentage of activation values that are retained. + prediction_fn: a function to get predictions out of logits. + scope: Optional variable_scope. + + Returns: + logits: the pre-softmax activations, a tensor of size + [batch_size, `num_classes`] + end_points: a dictionary from components of the network to the corresponding + activation. + """ + end_points = {} + + with tf.variable_scope(scope, 'LeNet', [images, num_classes]): + net = slim.conv2d(images, 32, [5, 5], scope='conv1') + net = slim.max_pool2d(net, [2, 2], 2, scope='pool1') + net = slim.conv2d(net, 64, [5, 5], scope='conv2') + net = slim.max_pool2d(net, [2, 2], 2, scope='pool2') + net = slim.flatten(net) + end_points['Flatten'] = net + + net = slim.fully_connected(net, 1024, scope='fc3') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout3') + logits = slim.fully_connected(net, num_classes, activation_fn=None, + scope='fc4') + + end_points['Logits'] = logits + end_points['Predictions'] = prediction_fn(logits, scope='Predictions') + + return logits, end_points +lenet.default_image_size = 28 + + +def lenet_arg_scope(weight_decay=0.0): + """Defines the default lenet argument scope. + + Args: + weight_decay: The weight decay to use for regularizing the model. + + Returns: + An `arg_scope` to use for the inception v3 model. + """ + with slim.arg_scope( + [slim.conv2d, slim.fully_connected], + weights_regularizer=slim.l2_regularizer(weight_decay), + weights_initializer=tf.truncated_normal_initializer(stddev=0.1), + activation_fn=tf.nn.relu) as sc: + return sc diff --git a/nets/nets_factory.py b/nets/nets_factory.py new file mode 100644 index 0000000..5268372 --- /dev/null +++ b/nets/nets_factory.py @@ -0,0 +1,107 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains a factory for building various models.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import functools + +import tensorflow as tf + +from nets import alexnet +from nets import cifarnet +from nets import inception +from nets import lenet +from nets import overfeat +from nets import resnet_v1 +from nets import resnet_v2 +from nets import vgg + +slim = tf.contrib.slim + +networks_map = {'alexnet_v2': alexnet.alexnet_v2, + 'cifarnet': cifarnet.cifarnet, + 'overfeat': overfeat.overfeat, + 'vgg_a': vgg.vgg_a, + 'vgg_16': vgg.vgg_16, + 'vgg_19': vgg.vgg_19, + 'inception_v1': inception.inception_v1, + 'inception_v2': inception.inception_v2, + 'inception_v3': inception.inception_v3, + 'inception_resnet_v2': inception.inception_resnet_v2, + 'lenet': lenet.lenet, + 'resnet_v1_50': resnet_v1.resnet_v1_50, + 'resnet_v1_101': resnet_v1.resnet_v1_101, + 'resnet_v1_152': resnet_v1.resnet_v1_152, + 'resnet_v1_200': resnet_v1.resnet_v1_200, + 'resnet_v2_50': resnet_v2.resnet_v2_50, + 'resnet_v2_101': resnet_v2.resnet_v2_101, + 'resnet_v2_152': resnet_v2.resnet_v2_152, + 'resnet_v2_200': resnet_v2.resnet_v2_200, + } + +arg_scopes_map = {'alexnet_v2': alexnet.alexnet_v2_arg_scope, + 'cifarnet': cifarnet.cifarnet_arg_scope, + 'overfeat': overfeat.overfeat_arg_scope, + 'vgg_a': vgg.vgg_arg_scope, + 'vgg_16': vgg.vgg_arg_scope, + 'vgg_19': vgg.vgg_arg_scope, + 'inception_v1': inception.inception_v3_arg_scope, + 'inception_v2': inception.inception_v3_arg_scope, + 'inception_v3': inception.inception_v3_arg_scope, + 'inception_resnet_v2': + inception.inception_resnet_v2_arg_scope, + 'lenet': lenet.lenet_arg_scope, + 'resnet_v1_50': resnet_v1.resnet_arg_scope, + 'resnet_v1_101': resnet_v1.resnet_arg_scope, + 'resnet_v1_152': resnet_v1.resnet_arg_scope, + 'resnet_v1_200': resnet_v1.resnet_arg_scope, + 'resnet_v2_50': resnet_v2.resnet_arg_scope, + 'resnet_v2_101': resnet_v2.resnet_arg_scope, + 'resnet_v2_152': resnet_v2.resnet_arg_scope, + 'resnet_v2_200': resnet_v2.resnet_arg_scope, + } + + +def get_network_fn(name, num_classes, weight_decay=0.0, is_training=False): + """Returns a network_fn such as `logits, end_points = network_fn(images)`. + + Args: + name: The name of the network. + num_classes: The number of classes to use for classification. + weight_decay: The l2 coefficient for the model weights. + is_training: `True` if the model is being used for training and `False` + otherwise. + + Returns: + network_fn: A function that applies the model to a batch of images. It has + the following signature: + logits, end_points = network_fn(images) + Raises: + ValueError: If network `name` is not recognized. + """ + if name not in networks_map: + raise ValueError('Name of network unknown %s' % name) + arg_scope = arg_scopes_map[name](weight_decay=weight_decay) + func = networks_map[name] + @functools.wraps(func) + def network_fn(images): + with slim.arg_scope(arg_scope): + return func(images, num_classes, is_training=is_training) + if hasattr(func, 'default_image_size'): + network_fn.default_image_size = func.default_image_size + + return network_fn diff --git a/nets/nets_factory_test.py b/nets/nets_factory_test.py new file mode 100644 index 0000000..6ac723b --- /dev/null +++ b/nets/nets_factory_test.py @@ -0,0 +1,46 @@ +# Copyright 2016 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for slim.inception.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +import tensorflow as tf + +from nets import nets_factory + + +class NetworksTest(tf.test.TestCase): + + def testGetNetworkFn(self): + batch_size = 5 + num_classes = 1000 + for net in nets_factory.networks_map: + with self.test_session(): + net_fn = nets_factory.get_network_fn(net, num_classes) + # Most networks use 224 as their default_image_size + image_size = getattr(net_fn, 'default_image_size', 224) + inputs = tf.random_uniform((batch_size, image_size, image_size, 3)) + logits, end_points = net_fn(inputs) + self.assertTrue(isinstance(logits, tf.Tensor)) + self.assertTrue(isinstance(end_points, dict)) + self.assertEqual(logits.get_shape().as_list()[0], batch_size) + self.assertEqual(logits.get_shape().as_list()[-1], num_classes) + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/overfeat.py b/nets/overfeat.py new file mode 100644 index 0000000..8ef566b --- /dev/null +++ b/nets/overfeat.py @@ -0,0 +1,118 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains the model definition for the OverFeat network. + +The definition for the network was obtained from: + OverFeat: Integrated Recognition, Localization and Detection using + Convolutional Networks + Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus and + Yann LeCun, 2014 + http://arxiv.org/abs/1312.6229 + +Usage: + with slim.arg_scope(overfeat.overfeat_arg_scope()): + outputs, end_points = overfeat.overfeat(inputs) + +@@overfeat +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim +trunc_normal = lambda stddev: tf.truncated_normal_initializer(0.0, stddev) + + +def overfeat_arg_scope(weight_decay=0.0005): + with slim.arg_scope([slim.conv2d, slim.fully_connected], + activation_fn=tf.nn.relu, + weights_regularizer=slim.l2_regularizer(weight_decay), + biases_initializer=tf.zeros_initializer): + with slim.arg_scope([slim.conv2d], padding='SAME'): + with slim.arg_scope([slim.max_pool2d], padding='VALID') as arg_sc: + return arg_sc + + +def overfeat(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.5, + spatial_squeeze=True, + scope='overfeat'): + """Contains the model definition for the OverFeat network. + + The definition for the network was obtained from: + OverFeat: Integrated Recognition, Localization and Detection using + Convolutional Networks + Pierre Sermanet, David Eigen, Xiang Zhang, Michael Mathieu, Rob Fergus and + Yann LeCun, 2014 + http://arxiv.org/abs/1312.6229 + + Note: All the fully_connected layers have been transformed to conv2d layers. + To use in classification mode, resize input to 231x231. To use in fully + convolutional mode, set spatial_squeeze to false. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether or not the model is being trained. + dropout_keep_prob: the probability that activations are kept in the dropout + layers during training. + spatial_squeeze: whether or not should squeeze the spatial dimensions of the + outputs. Useful to remove unnecessary dimensions for classification. + scope: Optional scope for the variables. + + Returns: + the last op containing the log predictions and end_points dict. + + """ + with tf.variable_scope(scope, 'overfeat', [inputs]) as sc: + end_points_collection = sc.name + '_end_points' + # Collect outputs for conv2d, fully_connected and max_pool2d + with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], + outputs_collections=end_points_collection): + net = slim.conv2d(inputs, 64, [11, 11], 4, padding='VALID', + scope='conv1') + net = slim.max_pool2d(net, [2, 2], scope='pool1') + net = slim.conv2d(net, 256, [5, 5], padding='VALID', scope='conv2') + net = slim.max_pool2d(net, [2, 2], scope='pool2') + net = slim.conv2d(net, 512, [3, 3], scope='conv3') + net = slim.conv2d(net, 1024, [3, 3], scope='conv4') + net = slim.conv2d(net, 1024, [3, 3], scope='conv5') + net = slim.max_pool2d(net, [2, 2], scope='pool5') + with slim.arg_scope([slim.conv2d], + weights_initializer=trunc_normal(0.005), + biases_initializer=tf.constant_initializer(0.1)): + # Use conv2d instead of fully_connected layers. + net = slim.conv2d(net, 3072, [6, 6], padding='VALID', scope='fc6') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout6') + net = slim.conv2d(net, 4096, [1, 1], scope='fc7') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout7') + net = slim.conv2d(net, num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + biases_initializer=tf.zeros_initializer, + scope='fc8') + # Convert end_points_collection into a end_point dict. + end_points = dict(tf.get_collection(end_points_collection)) + if spatial_squeeze: + net = tf.squeeze(net, [1, 2], name='fc8/squeezed') + end_points[sc.name + '/fc8'] = net + return net, end_points +overfeat.default_image_size = 231 diff --git a/nets/overfeat_test.py b/nets/overfeat_test.py new file mode 100644 index 0000000..71dbc9a --- /dev/null +++ b/nets/overfeat_test.py @@ -0,0 +1,145 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for slim.nets.overfeat.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from nets import overfeat + +slim = tf.contrib.slim + + +class OverFeatTest(tf.test.TestCase): + + def testBuild(self): + batch_size = 5 + height, width = 231, 231 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = overfeat.overfeat(inputs, num_classes) + self.assertEquals(logits.op.name, 'overfeat/fc8/squeezed') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + + def testFullyConvolutional(self): + batch_size = 1 + height, width = 281, 281 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = overfeat.overfeat(inputs, num_classes, spatial_squeeze=False) + self.assertEquals(logits.op.name, 'overfeat/fc8/BiasAdd') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, 2, 2, num_classes]) + + def testEndPoints(self): + batch_size = 5 + height, width = 231, 231 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = overfeat.overfeat(inputs, num_classes) + expected_names = ['overfeat/conv1', + 'overfeat/pool1', + 'overfeat/conv2', + 'overfeat/pool2', + 'overfeat/conv3', + 'overfeat/conv4', + 'overfeat/conv5', + 'overfeat/pool5', + 'overfeat/fc6', + 'overfeat/fc7', + 'overfeat/fc8' + ] + self.assertSetEqual(set(end_points.keys()), set(expected_names)) + + def testModelVariables(self): + batch_size = 5 + height, width = 231, 231 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + overfeat.overfeat(inputs, num_classes) + expected_names = ['overfeat/conv1/weights', + 'overfeat/conv1/biases', + 'overfeat/conv2/weights', + 'overfeat/conv2/biases', + 'overfeat/conv3/weights', + 'overfeat/conv3/biases', + 'overfeat/conv4/weights', + 'overfeat/conv4/biases', + 'overfeat/conv5/weights', + 'overfeat/conv5/biases', + 'overfeat/fc6/weights', + 'overfeat/fc6/biases', + 'overfeat/fc7/weights', + 'overfeat/fc7/biases', + 'overfeat/fc8/weights', + 'overfeat/fc8/biases', + ] + model_variables = [v.op.name for v in slim.get_model_variables()] + self.assertSetEqual(set(model_variables), set(expected_names)) + + def testEvaluation(self): + batch_size = 2 + height, width = 231, 231 + num_classes = 1000 + with self.test_session(): + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = overfeat.overfeat(eval_inputs, is_training=False) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + predictions = tf.argmax(logits, 1) + self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) + + def testTrainEvalWithReuse(self): + train_batch_size = 2 + eval_batch_size = 1 + train_height, train_width = 231, 231 + eval_height, eval_width = 281, 281 + num_classes = 1000 + with self.test_session(): + train_inputs = tf.random_uniform( + (train_batch_size, train_height, train_width, 3)) + logits, _ = overfeat.overfeat(train_inputs) + self.assertListEqual(logits.get_shape().as_list(), + [train_batch_size, num_classes]) + tf.get_variable_scope().reuse_variables() + eval_inputs = tf.random_uniform( + (eval_batch_size, eval_height, eval_width, 3)) + logits, _ = overfeat.overfeat(eval_inputs, is_training=False, + spatial_squeeze=False) + self.assertListEqual(logits.get_shape().as_list(), + [eval_batch_size, 2, 2, num_classes]) + logits = tf.reduce_mean(logits, [1, 2]) + predictions = tf.argmax(logits, 1) + self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) + + def testForward(self): + batch_size = 1 + height, width = 231, 231 + with self.test_session() as sess: + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = overfeat.overfeat(inputs) + sess.run(tf.initialize_all_variables()) + output = sess.run(logits) + self.assertTrue(output.any()) + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/resnet_utils.py b/nets/resnet_utils.py new file mode 100644 index 0000000..1e1dd82 --- /dev/null +++ b/nets/resnet_utils.py @@ -0,0 +1,254 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains building blocks for various versions of Residual Networks. + +Residual networks (ResNets) were proposed in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385, 2015 + +More variants were introduced in: + Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. arXiv: 1603.05027, 2016 + +We can obtain different ResNet variants by changing the network depth, width, +and form of residual unit. This module implements the infrastructure for +building them. Concrete ResNet units and full ResNet networks are implemented in +the accompanying resnet_v1.py and resnet_v2.py modules. + +Compared to https://github.com/KaimingHe/deep-residual-networks, in the current +implementation we subsample the output activations in the last residual unit of +each block, instead of subsampling the input activations in the first residual +unit of each block. The two implementations give identical results but our +implementation is more memory efficient. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import tensorflow as tf + +slim = tf.contrib.slim + + +class Block(collections.namedtuple('Block', ['scope', 'unit_fn', 'args'])): + """A named tuple describing a ResNet block. + + Its parts are: + scope: The scope of the `Block`. + unit_fn: The ResNet unit function which takes as input a `Tensor` and + returns another `Tensor` with the output of the ResNet unit. + args: A list of length equal to the number of units in the `Block`. The list + contains one (depth, depth_bottleneck, stride) tuple for each unit in the + block to serve as argument to unit_fn. + """ + + +def subsample(inputs, factor, scope=None): + """Subsamples the input along the spatial dimensions. + + Args: + inputs: A `Tensor` of size [batch, height_in, width_in, channels]. + factor: The subsampling factor. + scope: Optional variable_scope. + + Returns: + output: A `Tensor` of size [batch, height_out, width_out, channels] with the + input, either intact (if factor == 1) or subsampled (if factor > 1). + """ + if factor == 1: + return inputs + else: + return slim.max_pool2d(inputs, [1, 1], stride=factor, scope=scope) + + +def conv2d_same(inputs, num_outputs, kernel_size, stride, rate=1, scope=None): + """Strided 2-D convolution with 'SAME' padding. + + When stride > 1, then we do explicit zero-padding, followed by conv2d with + 'VALID' padding. + + Note that + + net = conv2d_same(inputs, num_outputs, 3, stride=stride) + + is equivalent to + + net = slim.conv2d(inputs, num_outputs, 3, stride=1, padding='SAME') + net = subsample(net, factor=stride) + + whereas + + net = slim.conv2d(inputs, num_outputs, 3, stride=stride, padding='SAME') + + is different when the input's height or width is even, which is why we add the + current function. For more details, see ResnetUtilsTest.testConv2DSameEven(). + + Args: + inputs: A 4-D tensor of size [batch, height_in, width_in, channels]. + num_outputs: An integer, the number of output filters. + kernel_size: An int with the kernel_size of the filters. + stride: An integer, the output stride. + rate: An integer, rate for atrous convolution. + scope: Scope. + + Returns: + output: A 4-D tensor of size [batch, height_out, width_out, channels] with + the convolution output. + """ + if stride == 1: + return slim.conv2d(inputs, num_outputs, kernel_size, stride=1, rate=rate, + padding='SAME', scope=scope) + else: + kernel_size_effective = kernel_size + (kernel_size - 1) * (rate - 1) + pad_total = kernel_size_effective - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + inputs = tf.pad(inputs, + [[0, 0], [pad_beg, pad_end], [pad_beg, pad_end], [0, 0]]) + return slim.conv2d(inputs, num_outputs, kernel_size, stride=stride, + rate=rate, padding='VALID', scope=scope) + + +@slim.add_arg_scope +def stack_blocks_dense(net, blocks, output_stride=None, + outputs_collections=None): + """Stacks ResNet `Blocks` and controls output feature density. + + First, this function creates scopes for the ResNet in the form of + 'block_name/unit_1', 'block_name/unit_2', etc. + + Second, this function allows the user to explicitly control the ResNet + output_stride, which is the ratio of the input to output spatial resolution. + This is useful for dense prediction tasks such as semantic segmentation or + object detection. + + Most ResNets consist of 4 ResNet blocks and subsample the activations by a + factor of 2 when transitioning between consecutive ResNet blocks. This results + to a nominal ResNet output_stride equal to 8. If we set the output_stride to + half the nominal network stride (e.g., output_stride=4), then we compute + responses twice. + + Control of the output feature density is implemented by atrous convolution. + + Args: + net: A `Tensor` of size [batch, height, width, channels]. + blocks: A list of length equal to the number of ResNet `Blocks`. Each + element is a ResNet `Block` object describing the units in the `Block`. + output_stride: If `None`, then the output will be computed at the nominal + network stride. If output_stride is not `None`, it specifies the requested + ratio of input to output spatial resolution, which needs to be equal to + the product of unit strides from the start up to some level of the ResNet. + For example, if the ResNet employs units with strides 1, 2, 1, 3, 4, 1, + then valid values for the output_stride are 1, 2, 6, 24 or None (which + is equivalent to output_stride=24). + outputs_collections: Collection to add the ResNet block outputs. + + Returns: + net: Output tensor with stride equal to the specified output_stride. + + Raises: + ValueError: If the target output_stride is not valid. + """ + # The current_stride variable keeps track of the effective stride of the + # activations. This allows us to invoke atrous convolution whenever applying + # the next residual unit would result in the activations having stride larger + # than the target output_stride. + current_stride = 1 + + # The atrous convolution rate parameter. + rate = 1 + + for block in blocks: + with tf.variable_scope(block.scope, 'block', [net]) as sc: + for i, unit in enumerate(block.args): + if output_stride is not None and current_stride > output_stride: + raise ValueError('The target output_stride cannot be reached.') + + with tf.variable_scope('unit_%d' % (i + 1), values=[net]): + unit_depth, unit_depth_bottleneck, unit_stride = unit + + # If we have reached the target output_stride, then we need to employ + # atrous convolution with stride=1 and multiply the atrous rate by the + # current unit's stride for use in subsequent layers. + if output_stride is not None and current_stride == output_stride: + net = block.unit_fn(net, + depth=unit_depth, + depth_bottleneck=unit_depth_bottleneck, + stride=1, + rate=rate) + rate *= unit_stride + + else: + net = block.unit_fn(net, + depth=unit_depth, + depth_bottleneck=unit_depth_bottleneck, + stride=unit_stride, + rate=1) + current_stride *= unit_stride + net = slim.utils.collect_named_outputs(outputs_collections, sc.name, net) + + if output_stride is not None and current_stride != output_stride: + raise ValueError('The target output_stride cannot be reached.') + + return net + + +def resnet_arg_scope(weight_decay=0.0001, + batch_norm_decay=0.997, + batch_norm_epsilon=1e-5, + batch_norm_scale=True): + """Defines the default ResNet arg scope. + + TODO(gpapan): The batch-normalization related default values above are + appropriate for use in conjunction with the reference ResNet models + released at https://github.com/KaimingHe/deep-residual-networks. When + training ResNets from scratch, they might need to be tuned. + + Args: + weight_decay: The weight decay to use for regularizing the model. + batch_norm_decay: The moving average decay when estimating layer activation + statistics in batch normalization. + batch_norm_epsilon: Small constant to prevent division by zero when + normalizing activations by their variance in batch normalization. + batch_norm_scale: If True, uses an explicit `gamma` multiplier to scale the + activations in the batch normalization layer. + + Returns: + An `arg_scope` to use for the resnet models. + """ + batch_norm_params = { + 'decay': batch_norm_decay, + 'epsilon': batch_norm_epsilon, + 'scale': batch_norm_scale, + 'updates_collections': tf.GraphKeys.UPDATE_OPS, + } + + with slim.arg_scope( + [slim.conv2d], + weights_regularizer=slim.l2_regularizer(weight_decay), + weights_initializer=slim.variance_scaling_initializer(), + activation_fn=tf.nn.relu, + normalizer_fn=slim.batch_norm, + normalizer_params=batch_norm_params): + with slim.arg_scope([slim.batch_norm], **batch_norm_params): + # The following implies padding='SAME' for pool1, which makes feature + # alignment easier for dense prediction tasks. This is also used in + # https://github.com/facebook/fb.resnet.torch. However the accompanying + # code of 'Deep Residual Learning for Image Recognition' uses + # padding='VALID' for pool1. You can switch to that choice by setting + # slim.arg_scope([slim.max_pool2d], padding='VALID'). + with slim.arg_scope([slim.max_pool2d], padding='SAME') as arg_sc: + return arg_sc diff --git a/nets/resnet_v1.py b/nets/resnet_v1.py new file mode 100644 index 0000000..ce7f147 --- /dev/null +++ b/nets/resnet_v1.py @@ -0,0 +1,295 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains definitions for the original form of Residual Networks. + +The 'v1' residual networks (ResNets) implemented in this module were proposed +by: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 + +Other variants were introduced in: +[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 + +The networks defined in this module utilize the bottleneck building block of +[1] with projection shortcuts only for increasing depths. They employ batch +normalization *after* every weight layer. This is the architecture used by +MSRA in the Imagenet and MSCOCO 2016 competition models ResNet-101 and +ResNet-152. See [2; Fig. 1a] for a comparison between the current 'v1' +architecture and the alternative 'v2' architecture of [2] which uses batch +normalization *before* every weight layer in the so-called full pre-activation +units. + +Typical use: + + from tensorflow.contrib.slim.nets import resnet_v1 + +ResNet-101 for image classification into 1000 classes: + + # inputs has shape [batch, 224, 224, 3] + with slim.arg_scope(resnet_v1.resnet_arg_scope()): + net, end_points = resnet_v1.resnet_v1_101(inputs, 1000, is_training=False) + +ResNet-101 for semantic segmentation into 21 classes: + + # inputs has shape [batch, 513, 513, 3] + with slim.arg_scope(resnet_v1.resnet_arg_scope()): + net, end_points = resnet_v1.resnet_v1_101(inputs, + 21, + is_training=False, + global_pool=False, + output_stride=16) +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from nets import resnet_utils + + +resnet_arg_scope = resnet_utils.resnet_arg_scope +slim = tf.contrib.slim + + +@slim.add_arg_scope +def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1, + outputs_collections=None, scope=None): + """Bottleneck residual unit variant with BN after convolutions. + + This is the original residual unit proposed in [1]. See Fig. 1(a) of [2] for + its definition. Note that we use here the bottleneck variant which has an + extra bottleneck layer. + + When putting together two consecutive ResNet blocks that use this unit, one + should use stride = 2 in the last unit of the first block. + + Args: + inputs: A tensor of size [batch, height, width, channels]. + depth: The depth of the ResNet unit output. + depth_bottleneck: The depth of the bottleneck layers. + stride: The ResNet unit's stride. Determines the amount of downsampling of + the units output compared to its input. + rate: An integer, rate for atrous convolution. + outputs_collections: Collection to add the ResNet unit output. + scope: Optional variable_scope. + + Returns: + The ResNet unit's output. + """ + with tf.variable_scope(scope, 'bottleneck_v1', [inputs]) as sc: + depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) + if depth == depth_in: + shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') + else: + shortcut = slim.conv2d(inputs, depth, [1, 1], stride=stride, + activation_fn=None, scope='shortcut') + + residual = slim.conv2d(inputs, depth_bottleneck, [1, 1], stride=1, + scope='conv1') + residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, + rate=rate, scope='conv2') + residual = slim.conv2d(residual, depth, [1, 1], stride=1, + activation_fn=None, scope='conv3') + + output = tf.nn.relu(shortcut + residual) + + return slim.utils.collect_named_outputs(outputs_collections, + sc.original_name_scope, + output) + + +def resnet_v1(inputs, + blocks, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + include_root_block=True, + reuse=None, + scope=None): + """Generator for v1 ResNet models. + + This function generates a family of ResNet v1 models. See the resnet_v1_*() + methods for specific model instantiations, obtained by selecting different + block instantiations that produce ResNets of various depths. + + Training for image classification on Imagenet is usually done with [224, 224] + inputs, resulting in [7, 7] feature maps at the output of the last ResNet + block for the ResNets defined in [1] that have nominal stride equal to 32. + However, for dense prediction tasks we advise that one uses inputs with + spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In + this case the feature maps at the ResNet output will have spatial shape + [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] + and corners exactly aligned with the input image corners, which greatly + facilitates alignment of the features to the image. Using as input [225, 225] + images results in [8, 8] feature maps at the output of the last ResNet block. + + For dense prediction tasks, the ResNet needs to run in fully-convolutional + (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all + have nominal stride equal to 32 and a good choice in FCN mode is to use + output_stride=16 in order to increase the density of the computed features at + small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + blocks: A list of length equal to the number of ResNet blocks. Each element + is a resnet_utils.Block object describing the units in the block. + num_classes: Number of predicted classes for classification tasks. If None + we return the features before the logit layer. + is_training: whether is training or not. + global_pool: If True, we perform global average pooling before computing the + logits. Set to True for image classification, False for dense prediction. + output_stride: If None, then the output will be computed at the nominal + network stride. If output_stride is not None, it specifies the requested + ratio of input to output spatial resolution. + include_root_block: If True, include the initial convolution followed by + max-pooling, if False excludes it. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + + Returns: + net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. + If global_pool is False, then height_out and width_out are reduced by a + factor of output_stride compared to the respective height_in and width_in, + else both height_out and width_out equal one. If num_classes is None, then + net is the output of the last ResNet block, potentially after global + average pooling. If num_classes is not None, net contains the pre-softmax + activations. + end_points: A dictionary from components of the network to the corresponding + activation. + + Raises: + ValueError: If the target output_stride is not valid. + """ + with tf.variable_scope(scope, 'resnet_v1', [inputs], reuse=reuse) as sc: + end_points_collection = sc.name + '_end_points' + with slim.arg_scope([slim.conv2d, bottleneck, + resnet_utils.stack_blocks_dense], + outputs_collections=end_points_collection): + with slim.arg_scope([slim.batch_norm], is_training=is_training): + net = inputs + if include_root_block: + if output_stride is not None: + if output_stride % 4 != 0: + raise ValueError('The output_stride needs to be a multiple of 4.') + output_stride /= 4 + net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') + net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') + net = resnet_utils.stack_blocks_dense(net, blocks, output_stride) + if global_pool: + # Global average pooling. + net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True) + if num_classes is not None: + net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, scope='logits') + # Convert end_points_collection into a dictionary of end_points. + end_points = dict(tf.get_collection(end_points_collection)) + if num_classes is not None: + end_points['predictions'] = slim.softmax(net, scope='predictions') + return net, end_points +resnet_v1.default_image_size = 224 + + +def resnet_v1_50(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v1_50'): + """ResNet-50 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 5 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1(inputs, blocks, num_classes, is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) + + +def resnet_v1_101(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v1_101'): + """ResNet-101 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 22 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3) + ] + return resnet_v1(inputs, blocks, num_classes, is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) + + +def resnet_v1_152(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v1_152'): + """ResNet-152 model of [1]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 7 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3)] + return resnet_v1(inputs, blocks, num_classes, is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) + + +def resnet_v1_200(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v1_200'): + """ResNet-200 model of [2]. See resnet_v1() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 23 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3)] + return resnet_v1(inputs, blocks, num_classes, is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) diff --git a/nets/resnet_v1_test.py b/nets/resnet_v1_test.py new file mode 100644 index 0000000..6198212 --- /dev/null +++ b/nets/resnet_v1_test.py @@ -0,0 +1,450 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for slim.nets.resnet_v1.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from nets import resnet_utils +from nets import resnet_v1 + +slim = tf.contrib.slim + + +def create_test_input(batch_size, height, width, channels): + """Create test input tensor. + + Args: + batch_size: The number of images per batch or `None` if unknown. + height: The height of each image or `None` if unknown. + width: The width of each image or `None` if unknown. + channels: The number of channels per image or `None` if unknown. + + Returns: + Either a placeholder `Tensor` of dimension + [batch_size, height, width, channels] if any of the inputs are `None` or a + constant `Tensor` with the mesh grid values along the spatial dimensions. + """ + if None in [batch_size, height, width, channels]: + return tf.placeholder(tf.float32, (batch_size, height, width, channels)) + else: + return tf.to_float( + np.tile( + np.reshape( + np.reshape(np.arange(height), [height, 1]) + + np.reshape(np.arange(width), [1, width]), + [1, height, width, 1]), + [batch_size, 1, 1, channels])) + + +class ResnetUtilsTest(tf.test.TestCase): + + def testSubsampleThreeByThree(self): + x = tf.reshape(tf.to_float(tf.range(9)), [1, 3, 3, 1]) + x = resnet_utils.subsample(x, 2) + expected = tf.reshape(tf.constant([0, 2, 6, 8]), [1, 2, 2, 1]) + with self.test_session(): + self.assertAllClose(x.eval(), expected.eval()) + + def testSubsampleFourByFour(self): + x = tf.reshape(tf.to_float(tf.range(16)), [1, 4, 4, 1]) + x = resnet_utils.subsample(x, 2) + expected = tf.reshape(tf.constant([0, 2, 8, 10]), [1, 2, 2, 1]) + with self.test_session(): + self.assertAllClose(x.eval(), expected.eval()) + + def testConv2DSameEven(self): + n, n2 = 4, 2 + + # Input image. + x = create_test_input(1, n, n, 1) + + # Convolution kernel. + w = create_test_input(1, 3, 3, 1) + w = tf.reshape(w, [3, 3, 1, 1]) + + tf.get_variable('Conv/weights', initializer=w) + tf.get_variable('Conv/biases', initializer=tf.zeros([1])) + tf.get_variable_scope().reuse_variables() + + y1 = slim.conv2d(x, 1, [3, 3], stride=1, scope='Conv') + y1_expected = tf.to_float([[14, 28, 43, 26], + [28, 48, 66, 37], + [43, 66, 84, 46], + [26, 37, 46, 22]]) + y1_expected = tf.reshape(y1_expected, [1, n, n, 1]) + + y2 = resnet_utils.subsample(y1, 2) + y2_expected = tf.to_float([[14, 43], + [43, 84]]) + y2_expected = tf.reshape(y2_expected, [1, n2, n2, 1]) + + y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv') + y3_expected = y2_expected + + y4 = slim.conv2d(x, 1, [3, 3], stride=2, scope='Conv') + y4_expected = tf.to_float([[48, 37], + [37, 22]]) + y4_expected = tf.reshape(y4_expected, [1, n2, n2, 1]) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + self.assertAllClose(y1.eval(), y1_expected.eval()) + self.assertAllClose(y2.eval(), y2_expected.eval()) + self.assertAllClose(y3.eval(), y3_expected.eval()) + self.assertAllClose(y4.eval(), y4_expected.eval()) + + def testConv2DSameOdd(self): + n, n2 = 5, 3 + + # Input image. + x = create_test_input(1, n, n, 1) + + # Convolution kernel. + w = create_test_input(1, 3, 3, 1) + w = tf.reshape(w, [3, 3, 1, 1]) + + tf.get_variable('Conv/weights', initializer=w) + tf.get_variable('Conv/biases', initializer=tf.zeros([1])) + tf.get_variable_scope().reuse_variables() + + y1 = slim.conv2d(x, 1, [3, 3], stride=1, scope='Conv') + y1_expected = tf.to_float([[14, 28, 43, 58, 34], + [28, 48, 66, 84, 46], + [43, 66, 84, 102, 55], + [58, 84, 102, 120, 64], + [34, 46, 55, 64, 30]]) + y1_expected = tf.reshape(y1_expected, [1, n, n, 1]) + + y2 = resnet_utils.subsample(y1, 2) + y2_expected = tf.to_float([[14, 43, 34], + [43, 84, 55], + [34, 55, 30]]) + y2_expected = tf.reshape(y2_expected, [1, n2, n2, 1]) + + y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv') + y3_expected = y2_expected + + y4 = slim.conv2d(x, 1, [3, 3], stride=2, scope='Conv') + y4_expected = y2_expected + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + self.assertAllClose(y1.eval(), y1_expected.eval()) + self.assertAllClose(y2.eval(), y2_expected.eval()) + self.assertAllClose(y3.eval(), y3_expected.eval()) + self.assertAllClose(y4.eval(), y4_expected.eval()) + + def _resnet_plain(self, inputs, blocks, output_stride=None, scope=None): + """A plain ResNet without extra layers before or after the ResNet blocks.""" + with tf.variable_scope(scope, values=[inputs]): + with slim.arg_scope([slim.conv2d], outputs_collections='end_points'): + net = resnet_utils.stack_blocks_dense(inputs, blocks, output_stride) + end_points = dict(tf.get_collection('end_points')) + return net, end_points + + def testEndPointsV1(self): + """Test the end points of a tiny v1 bottleneck network.""" + bottleneck = resnet_v1.bottleneck + blocks = [resnet_utils.Block('block1', bottleneck, [(4, 1, 1), (4, 1, 2)]), + resnet_utils.Block('block2', bottleneck, [(8, 2, 1), (8, 2, 1)])] + inputs = create_test_input(2, 32, 16, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_plain(inputs, blocks, scope='tiny') + expected = [ + 'tiny/block1/unit_1/bottleneck_v1/shortcut', + 'tiny/block1/unit_1/bottleneck_v1/conv1', + 'tiny/block1/unit_1/bottleneck_v1/conv2', + 'tiny/block1/unit_1/bottleneck_v1/conv3', + 'tiny/block1/unit_2/bottleneck_v1/conv1', + 'tiny/block1/unit_2/bottleneck_v1/conv2', + 'tiny/block1/unit_2/bottleneck_v1/conv3', + 'tiny/block2/unit_1/bottleneck_v1/shortcut', + 'tiny/block2/unit_1/bottleneck_v1/conv1', + 'tiny/block2/unit_1/bottleneck_v1/conv2', + 'tiny/block2/unit_1/bottleneck_v1/conv3', + 'tiny/block2/unit_2/bottleneck_v1/conv1', + 'tiny/block2/unit_2/bottleneck_v1/conv2', + 'tiny/block2/unit_2/bottleneck_v1/conv3'] + self.assertItemsEqual(expected, end_points) + + def _stack_blocks_nondense(self, net, blocks): + """A simplified ResNet Block stacker without output stride control.""" + for block in blocks: + with tf.variable_scope(block.scope, 'block', [net]): + for i, unit in enumerate(block.args): + depth, depth_bottleneck, stride = unit + with tf.variable_scope('unit_%d' % (i + 1), values=[net]): + net = block.unit_fn(net, + depth=depth, + depth_bottleneck=depth_bottleneck, + stride=stride, + rate=1) + return net + + def _atrousValues(self, bottleneck): + """Verify the values of dense feature extraction by atrous convolution. + + Make sure that dense feature extraction by stack_blocks_dense() followed by + subsampling gives identical results to feature extraction at the nominal + network output stride using the simple self._stack_blocks_nondense() above. + + Args: + bottleneck: The bottleneck function. + """ + blocks = [ + resnet_utils.Block('block1', bottleneck, [(4, 1, 1), (4, 1, 2)]), + resnet_utils.Block('block2', bottleneck, [(8, 2, 1), (8, 2, 2)]), + resnet_utils.Block('block3', bottleneck, [(16, 4, 1), (16, 4, 2)]), + resnet_utils.Block('block4', bottleneck, [(32, 8, 1), (32, 8, 1)]) + ] + nominal_stride = 8 + + # Test both odd and even input dimensions. + height = 30 + width = 31 + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + with slim.arg_scope([slim.batch_norm], is_training=False): + for output_stride in [1, 2, 4, 8, None]: + with tf.Graph().as_default(): + with self.test_session() as sess: + tf.set_random_seed(0) + inputs = create_test_input(1, height, width, 3) + # Dense feature extraction followed by subsampling. + output = resnet_utils.stack_blocks_dense(inputs, + blocks, + output_stride) + if output_stride is None: + factor = 1 + else: + factor = nominal_stride // output_stride + + output = resnet_utils.subsample(output, factor) + # Make the two networks use the same weights. + tf.get_variable_scope().reuse_variables() + # Feature extraction at the nominal network rate. + expected = self._stack_blocks_nondense(inputs, blocks) + sess.run(tf.initialize_all_variables()) + output, expected = sess.run([output, expected]) + self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4) + + def testAtrousValuesBottleneck(self): + self._atrousValues(resnet_v1.bottleneck) + + +class ResnetCompleteNetworkTest(tf.test.TestCase): + """Tests with complete small ResNet v1 networks.""" + + def _resnet_small(self, + inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + include_root_block=True, + reuse=None, + scope='resnet_v1_small'): + """A shallow and thin ResNet v1 for faster tests.""" + bottleneck = resnet_v1.bottleneck + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(4, 1, 1)] * 2 + [(4, 1, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(8, 2, 1)] * 2 + [(8, 2, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(16, 4, 1)] * 2 + [(16, 4, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(32, 8, 1)] * 2)] + return resnet_v1.resnet_v1(inputs, blocks, num_classes, + is_training=is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=include_root_block, + reuse=reuse, + scope=scope) + + def testClassificationEndPoints(self): + global_pool = True + num_classes = 10 + inputs = create_test_input(2, 224, 224, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + logits, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + self.assertTrue(logits.op.name.startswith('resnet/logits')) + self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes]) + self.assertTrue('predictions' in end_points) + self.assertListEqual(end_points['predictions'].get_shape().as_list(), + [2, 1, 1, num_classes]) + + def testClassificationShapes(self): + global_pool = True + num_classes = 10 + inputs = create_test_input(2, 224, 224, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 28, 28, 4], + 'resnet/block2': [2, 14, 14, 8], + 'resnet/block3': [2, 7, 7, 16], + 'resnet/block4': [2, 7, 7, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testFullyConvolutionalEndpointShapes(self): + global_pool = False + num_classes = 10 + inputs = create_test_input(2, 321, 321, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 41, 41, 4], + 'resnet/block2': [2, 21, 21, 8], + 'resnet/block3': [2, 11, 11, 16], + 'resnet/block4': [2, 11, 11, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testRootlessFullyConvolutionalEndpointShapes(self): + global_pool = False + num_classes = 10 + inputs = create_test_input(2, 128, 128, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + include_root_block=False, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 64, 64, 4], + 'resnet/block2': [2, 32, 32, 8], + 'resnet/block3': [2, 16, 16, 16], + 'resnet/block4': [2, 16, 16, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testAtrousFullyConvolutionalEndpointShapes(self): + global_pool = False + num_classes = 10 + output_stride = 8 + inputs = create_test_input(2, 321, 321, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, + num_classes, + global_pool=global_pool, + output_stride=output_stride, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 41, 41, 4], + 'resnet/block2': [2, 41, 41, 8], + 'resnet/block3': [2, 41, 41, 16], + 'resnet/block4': [2, 41, 41, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testAtrousFullyConvolutionalValues(self): + """Verify dense feature extraction with atrous convolution.""" + nominal_stride = 32 + for output_stride in [4, 8, 16, 32, None]: + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + with tf.Graph().as_default(): + with self.test_session() as sess: + tf.set_random_seed(0) + inputs = create_test_input(2, 81, 81, 3) + # Dense feature extraction followed by subsampling. + output, _ = self._resnet_small(inputs, None, is_training=False, + global_pool=False, + output_stride=output_stride) + if output_stride is None: + factor = 1 + else: + factor = nominal_stride // output_stride + output = resnet_utils.subsample(output, factor) + # Make the two networks use the same weights. + tf.get_variable_scope().reuse_variables() + # Feature extraction at the nominal network rate. + expected, _ = self._resnet_small(inputs, None, is_training=False, + global_pool=False) + sess.run(tf.initialize_all_variables()) + self.assertAllClose(output.eval(), expected.eval(), + atol=1e-4, rtol=1e-4) + + def testUnknownBatchSize(self): + batch = 2 + height, width = 65, 65 + global_pool = True + num_classes = 10 + inputs = create_test_input(None, height, width, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + logits, _ = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + self.assertTrue(logits.op.name.startswith('resnet/logits')) + self.assertListEqual(logits.get_shape().as_list(), + [None, 1, 1, num_classes]) + images = create_test_input(batch, height, width, 3) + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch, 1, 1, num_classes)) + + def testFullyConvolutionalUnknownHeightWidth(self): + batch = 2 + height, width = 65, 65 + global_pool = False + inputs = create_test_input(batch, None, None, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + output, _ = self._resnet_small(inputs, None, global_pool=global_pool) + self.assertListEqual(output.get_shape().as_list(), + [batch, None, None, 32]) + images = create_test_input(batch, height, width, 3) + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(output, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch, 3, 3, 32)) + + def testAtrousFullyConvolutionalUnknownHeightWidth(self): + batch = 2 + height, width = 65, 65 + global_pool = False + output_stride = 8 + inputs = create_test_input(batch, None, None, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + output, _ = self._resnet_small(inputs, + None, + global_pool=global_pool, + output_stride=output_stride) + self.assertListEqual(output.get_shape().as_list(), + [batch, None, None, 32]) + images = create_test_input(batch, height, width, 3) + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(output, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch, 9, 9, 32)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/resnet_v2.py b/nets/resnet_v2.py new file mode 100644 index 0000000..e8b7d30 --- /dev/null +++ b/nets/resnet_v2.py @@ -0,0 +1,302 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains definitions for the preactivation form of Residual Networks. + +Residual networks (ResNets) were originally proposed in: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 + +The full preactivation 'v2' ResNet variant implemented in this module was +introduced by: +[2] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. arXiv: 1603.05027 + +The key difference of the full preactivation 'v2' variant compared to the +'v1' variant in [1] is the use of batch normalization before every weight layer. +Another difference is that 'v2' ResNets do not include an activation function in +the main pathway. Also see [2; Fig. 4e]. + +Typical use: + + from tensorflow.contrib.slim.nets import resnet_v2 + +ResNet-101 for image classification into 1000 classes: + + # inputs has shape [batch, 224, 224, 3] + with slim.arg_scope(resnet_v2.resnet_arg_scope()): + net, end_points = resnet_v2.resnet_v2_101(inputs, 1000, is_training=False) + +ResNet-101 for semantic segmentation into 21 classes: + + # inputs has shape [batch, 513, 513, 3] + with slim.arg_scope(resnet_v2.resnet_arg_scope(is_training)): + net, end_points = resnet_v2.resnet_v2_101(inputs, + 21, + is_training=False, + global_pool=False, + output_stride=16) +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from nets import resnet_utils + +slim = tf.contrib.slim +resnet_arg_scope = resnet_utils.resnet_arg_scope + + +@slim.add_arg_scope +def bottleneck(inputs, depth, depth_bottleneck, stride, rate=1, + outputs_collections=None, scope=None): + """Bottleneck residual unit variant with BN before convolutions. + + This is the full preactivation residual unit variant proposed in [2]. See + Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck + variant which has an extra bottleneck layer. + + When putting together two consecutive ResNet blocks that use this unit, one + should use stride = 2 in the last unit of the first block. + + Args: + inputs: A tensor of size [batch, height, width, channels]. + depth: The depth of the ResNet unit output. + depth_bottleneck: The depth of the bottleneck layers. + stride: The ResNet unit's stride. Determines the amount of downsampling of + the units output compared to its input. + rate: An integer, rate for atrous convolution. + outputs_collections: Collection to add the ResNet unit output. + scope: Optional variable_scope. + + Returns: + The ResNet unit's output. + """ + with tf.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc: + depth_in = slim.utils.last_dimension(inputs.get_shape(), min_rank=4) + preact = slim.batch_norm(inputs, activation_fn=tf.nn.relu, scope='preact') + if depth == depth_in: + shortcut = resnet_utils.subsample(inputs, stride, 'shortcut') + else: + shortcut = slim.conv2d(preact, depth, [1, 1], stride=stride, + normalizer_fn=None, activation_fn=None, + scope='shortcut') + + residual = slim.conv2d(preact, depth_bottleneck, [1, 1], stride=1, + scope='conv1') + residual = resnet_utils.conv2d_same(residual, depth_bottleneck, 3, stride, + rate=rate, scope='conv2') + residual = slim.conv2d(residual, depth, [1, 1], stride=1, + normalizer_fn=None, activation_fn=None, + scope='conv3') + + output = shortcut + residual + + return slim.utils.collect_named_outputs(outputs_collections, + sc.original_name_scope, + output) + + +def resnet_v2(inputs, + blocks, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + include_root_block=True, + reuse=None, + scope=None): + """Generator for v2 (preactivation) ResNet models. + + This function generates a family of ResNet v2 models. See the resnet_v2_*() + methods for specific model instantiations, obtained by selecting different + block instantiations that produce ResNets of various depths. + + Training for image classification on Imagenet is usually done with [224, 224] + inputs, resulting in [7, 7] feature maps at the output of the last ResNet + block for the ResNets defined in [1] that have nominal stride equal to 32. + However, for dense prediction tasks we advise that one uses inputs with + spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In + this case the feature maps at the ResNet output will have spatial shape + [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1] + and corners exactly aligned with the input image corners, which greatly + facilitates alignment of the features to the image. Using as input [225, 225] + images results in [8, 8] feature maps at the output of the last ResNet block. + + For dense prediction tasks, the ResNet needs to run in fully-convolutional + (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all + have nominal stride equal to 32 and a good choice in FCN mode is to use + output_stride=16 in order to increase the density of the computed features at + small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915. + + Args: + inputs: A tensor of size [batch, height_in, width_in, channels]. + blocks: A list of length equal to the number of ResNet blocks. Each element + is a resnet_utils.Block object describing the units in the block. + num_classes: Number of predicted classes for classification tasks. If None + we return the features before the logit layer. + is_training: whether is training or not. + global_pool: If True, we perform global average pooling before computing the + logits. Set to True for image classification, False for dense prediction. + output_stride: If None, then the output will be computed at the nominal + network stride. If output_stride is not None, it specifies the requested + ratio of input to output spatial resolution. + include_root_block: If True, include the initial convolution followed by + max-pooling, if False excludes it. If excluded, `inputs` should be the + results of an activation-less convolution. + reuse: whether or not the network and its variables should be reused. To be + able to reuse 'scope' must be given. + scope: Optional variable_scope. + + + Returns: + net: A rank-4 tensor of size [batch, height_out, width_out, channels_out]. + If global_pool is False, then height_out and width_out are reduced by a + factor of output_stride compared to the respective height_in and width_in, + else both height_out and width_out equal one. If num_classes is None, then + net is the output of the last ResNet block, potentially after global + average pooling. If num_classes is not None, net contains the pre-softmax + activations. + end_points: A dictionary from components of the network to the corresponding + activation. + + Raises: + ValueError: If the target output_stride is not valid. + """ + with tf.variable_scope(scope, 'resnet_v2', [inputs], reuse=reuse) as sc: + end_points_collection = sc.name + '_end_points' + with slim.arg_scope([slim.conv2d, bottleneck, + resnet_utils.stack_blocks_dense], + outputs_collections=end_points_collection): + with slim.arg_scope([slim.batch_norm], is_training=is_training): + net = inputs + if include_root_block: + if output_stride is not None: + if output_stride % 4 != 0: + raise ValueError('The output_stride needs to be a multiple of 4.') + output_stride /= 4 + # We do not include batch normalization or activation functions in + # conv1 because the first ResNet unit will perform these. Cf. + # Appendix of [2]. + with slim.arg_scope([slim.conv2d], + activation_fn=None, normalizer_fn=None): + net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1') + net = slim.max_pool2d(net, [3, 3], stride=2, scope='pool1') + net = resnet_utils.stack_blocks_dense(net, blocks, output_stride) + # This is needed because the pre-activation variant does not have batch + # normalization or activation functions in the residual unit output. See + # Appendix of [2]. + net = slim.batch_norm(net, activation_fn=tf.nn.relu, scope='postnorm') + if global_pool: + # Global average pooling. + net = tf.reduce_mean(net, [1, 2], name='pool5', keep_dims=True) + if num_classes is not None: + net = slim.conv2d(net, num_classes, [1, 1], activation_fn=None, + normalizer_fn=None, scope='logits') + # Convert end_points_collection into a dictionary of end_points. + end_points = dict(tf.get_collection(end_points_collection)) + if num_classes is not None: + end_points['predictions'] = slim.softmax(net, scope='predictions') + return net, end_points +resnet_v2.default_image_size = 224 + + +def resnet_v2_50(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v2_50'): + """ResNet-50 model of [1]. See resnet_v2() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 5 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3)] + return resnet_v2(inputs, blocks, num_classes, is_training=is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) + + +def resnet_v2_101(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v2_101'): + """ResNet-101 model of [1]. See resnet_v2() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 3 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 22 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3)] + return resnet_v2(inputs, blocks, num_classes, is_training=is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) + + +def resnet_v2_152(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v2_152'): + """ResNet-152 model of [1]. See resnet_v2() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 7 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3)] + return resnet_v2(inputs, blocks, num_classes, is_training=is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) + + +def resnet_v2_200(inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + reuse=None, + scope='resnet_v2_200'): + """ResNet-200 model of [2]. See resnet_v2() for arg and return description.""" + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(256, 64, 1)] * 2 + [(256, 64, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(512, 128, 1)] * 23 + [(512, 128, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(1024, 256, 1)] * 35 + [(1024, 256, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(2048, 512, 1)] * 3)] + return resnet_v2(inputs, blocks, num_classes, is_training=is_training, + global_pool=global_pool, output_stride=output_stride, + include_root_block=True, reuse=reuse, scope=scope) diff --git a/nets/resnet_v2_test.py b/nets/resnet_v2_test.py new file mode 100644 index 0000000..1b4fa75 --- /dev/null +++ b/nets/resnet_v2_test.py @@ -0,0 +1,453 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for slim.nets.resnet_v2.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from nets import resnet_utils +from nets import resnet_v2 + +slim = tf.contrib.slim + + +def create_test_input(batch_size, height, width, channels): + """Create test input tensor. + + Args: + batch_size: The number of images per batch or `None` if unknown. + height: The height of each image or `None` if unknown. + width: The width of each image or `None` if unknown. + channels: The number of channels per image or `None` if unknown. + + Returns: + Either a placeholder `Tensor` of dimension + [batch_size, height, width, channels] if any of the inputs are `None` or a + constant `Tensor` with the mesh grid values along the spatial dimensions. + """ + if None in [batch_size, height, width, channels]: + return tf.placeholder(tf.float32, (batch_size, height, width, channels)) + else: + return tf.to_float( + np.tile( + np.reshape( + np.reshape(np.arange(height), [height, 1]) + + np.reshape(np.arange(width), [1, width]), + [1, height, width, 1]), + [batch_size, 1, 1, channels])) + + +class ResnetUtilsTest(tf.test.TestCase): + + def testSubsampleThreeByThree(self): + x = tf.reshape(tf.to_float(tf.range(9)), [1, 3, 3, 1]) + x = resnet_utils.subsample(x, 2) + expected = tf.reshape(tf.constant([0, 2, 6, 8]), [1, 2, 2, 1]) + with self.test_session(): + self.assertAllClose(x.eval(), expected.eval()) + + def testSubsampleFourByFour(self): + x = tf.reshape(tf.to_float(tf.range(16)), [1, 4, 4, 1]) + x = resnet_utils.subsample(x, 2) + expected = tf.reshape(tf.constant([0, 2, 8, 10]), [1, 2, 2, 1]) + with self.test_session(): + self.assertAllClose(x.eval(), expected.eval()) + + def testConv2DSameEven(self): + n, n2 = 4, 2 + + # Input image. + x = create_test_input(1, n, n, 1) + + # Convolution kernel. + w = create_test_input(1, 3, 3, 1) + w = tf.reshape(w, [3, 3, 1, 1]) + + tf.get_variable('Conv/weights', initializer=w) + tf.get_variable('Conv/biases', initializer=tf.zeros([1])) + tf.get_variable_scope().reuse_variables() + + y1 = slim.conv2d(x, 1, [3, 3], stride=1, scope='Conv') + y1_expected = tf.to_float([[14, 28, 43, 26], + [28, 48, 66, 37], + [43, 66, 84, 46], + [26, 37, 46, 22]]) + y1_expected = tf.reshape(y1_expected, [1, n, n, 1]) + + y2 = resnet_utils.subsample(y1, 2) + y2_expected = tf.to_float([[14, 43], + [43, 84]]) + y2_expected = tf.reshape(y2_expected, [1, n2, n2, 1]) + + y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv') + y3_expected = y2_expected + + y4 = slim.conv2d(x, 1, [3, 3], stride=2, scope='Conv') + y4_expected = tf.to_float([[48, 37], + [37, 22]]) + y4_expected = tf.reshape(y4_expected, [1, n2, n2, 1]) + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + self.assertAllClose(y1.eval(), y1_expected.eval()) + self.assertAllClose(y2.eval(), y2_expected.eval()) + self.assertAllClose(y3.eval(), y3_expected.eval()) + self.assertAllClose(y4.eval(), y4_expected.eval()) + + def testConv2DSameOdd(self): + n, n2 = 5, 3 + + # Input image. + x = create_test_input(1, n, n, 1) + + # Convolution kernel. + w = create_test_input(1, 3, 3, 1) + w = tf.reshape(w, [3, 3, 1, 1]) + + tf.get_variable('Conv/weights', initializer=w) + tf.get_variable('Conv/biases', initializer=tf.zeros([1])) + tf.get_variable_scope().reuse_variables() + + y1 = slim.conv2d(x, 1, [3, 3], stride=1, scope='Conv') + y1_expected = tf.to_float([[14, 28, 43, 58, 34], + [28, 48, 66, 84, 46], + [43, 66, 84, 102, 55], + [58, 84, 102, 120, 64], + [34, 46, 55, 64, 30]]) + y1_expected = tf.reshape(y1_expected, [1, n, n, 1]) + + y2 = resnet_utils.subsample(y1, 2) + y2_expected = tf.to_float([[14, 43, 34], + [43, 84, 55], + [34, 55, 30]]) + y2_expected = tf.reshape(y2_expected, [1, n2, n2, 1]) + + y3 = resnet_utils.conv2d_same(x, 1, 3, stride=2, scope='Conv') + y3_expected = y2_expected + + y4 = slim.conv2d(x, 1, [3, 3], stride=2, scope='Conv') + y4_expected = y2_expected + + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + self.assertAllClose(y1.eval(), y1_expected.eval()) + self.assertAllClose(y2.eval(), y2_expected.eval()) + self.assertAllClose(y3.eval(), y3_expected.eval()) + self.assertAllClose(y4.eval(), y4_expected.eval()) + + def _resnet_plain(self, inputs, blocks, output_stride=None, scope=None): + """A plain ResNet without extra layers before or after the ResNet blocks.""" + with tf.variable_scope(scope, values=[inputs]): + with slim.arg_scope([slim.conv2d], outputs_collections='end_points'): + net = resnet_utils.stack_blocks_dense(inputs, blocks, output_stride) + end_points = dict(tf.get_collection('end_points')) + return net, end_points + + def testEndPointsV2(self): + """Test the end points of a tiny v2 bottleneck network.""" + bottleneck = resnet_v2.bottleneck + blocks = [resnet_utils.Block('block1', bottleneck, [(4, 1, 1), (4, 1, 2)]), + resnet_utils.Block('block2', bottleneck, [(8, 2, 1), (8, 2, 1)])] + inputs = create_test_input(2, 32, 16, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_plain(inputs, blocks, scope='tiny') + expected = [ + 'tiny/block1/unit_1/bottleneck_v2/shortcut', + 'tiny/block1/unit_1/bottleneck_v2/conv1', + 'tiny/block1/unit_1/bottleneck_v2/conv2', + 'tiny/block1/unit_1/bottleneck_v2/conv3', + 'tiny/block1/unit_2/bottleneck_v2/conv1', + 'tiny/block1/unit_2/bottleneck_v2/conv2', + 'tiny/block1/unit_2/bottleneck_v2/conv3', + 'tiny/block2/unit_1/bottleneck_v2/shortcut', + 'tiny/block2/unit_1/bottleneck_v2/conv1', + 'tiny/block2/unit_1/bottleneck_v2/conv2', + 'tiny/block2/unit_1/bottleneck_v2/conv3', + 'tiny/block2/unit_2/bottleneck_v2/conv1', + 'tiny/block2/unit_2/bottleneck_v2/conv2', + 'tiny/block2/unit_2/bottleneck_v2/conv3'] + self.assertItemsEqual(expected, end_points) + + def _stack_blocks_nondense(self, net, blocks): + """A simplified ResNet Block stacker without output stride control.""" + for block in blocks: + with tf.variable_scope(block.scope, 'block', [net]): + for i, unit in enumerate(block.args): + depth, depth_bottleneck, stride = unit + with tf.variable_scope('unit_%d' % (i + 1), values=[net]): + net = block.unit_fn(net, + depth=depth, + depth_bottleneck=depth_bottleneck, + stride=stride, + rate=1) + return net + + def _atrousValues(self, bottleneck): + """Verify the values of dense feature extraction by atrous convolution. + + Make sure that dense feature extraction by stack_blocks_dense() followed by + subsampling gives identical results to feature extraction at the nominal + network output stride using the simple self._stack_blocks_nondense() above. + + Args: + bottleneck: The bottleneck function. + """ + blocks = [ + resnet_utils.Block('block1', bottleneck, [(4, 1, 1), (4, 1, 2)]), + resnet_utils.Block('block2', bottleneck, [(8, 2, 1), (8, 2, 2)]), + resnet_utils.Block('block3', bottleneck, [(16, 4, 1), (16, 4, 2)]), + resnet_utils.Block('block4', bottleneck, [(32, 8, 1), (32, 8, 1)]) + ] + nominal_stride = 8 + + # Test both odd and even input dimensions. + height = 30 + width = 31 + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + with slim.arg_scope([slim.batch_norm], is_training=False): + for output_stride in [1, 2, 4, 8, None]: + with tf.Graph().as_default(): + with self.test_session() as sess: + tf.set_random_seed(0) + inputs = create_test_input(1, height, width, 3) + # Dense feature extraction followed by subsampling. + output = resnet_utils.stack_blocks_dense(inputs, + blocks, + output_stride) + if output_stride is None: + factor = 1 + else: + factor = nominal_stride // output_stride + + output = resnet_utils.subsample(output, factor) + # Make the two networks use the same weights. + tf.get_variable_scope().reuse_variables() + # Feature extraction at the nominal network rate. + expected = self._stack_blocks_nondense(inputs, blocks) + sess.run(tf.initialize_all_variables()) + output, expected = sess.run([output, expected]) + self.assertAllClose(output, expected, atol=1e-4, rtol=1e-4) + + def testAtrousValuesBottleneck(self): + self._atrousValues(resnet_v2.bottleneck) + + +class ResnetCompleteNetworkTest(tf.test.TestCase): + """Tests with complete small ResNet v2 networks.""" + + def _resnet_small(self, + inputs, + num_classes=None, + is_training=True, + global_pool=True, + output_stride=None, + include_root_block=True, + reuse=None, + scope='resnet_v2_small'): + """A shallow and thin ResNet v2 for faster tests.""" + bottleneck = resnet_v2.bottleneck + blocks = [ + resnet_utils.Block( + 'block1', bottleneck, [(4, 1, 1)] * 2 + [(4, 1, 2)]), + resnet_utils.Block( + 'block2', bottleneck, [(8, 2, 1)] * 2 + [(8, 2, 2)]), + resnet_utils.Block( + 'block3', bottleneck, [(16, 4, 1)] * 2 + [(16, 4, 2)]), + resnet_utils.Block( + 'block4', bottleneck, [(32, 8, 1)] * 2)] + return resnet_v2.resnet_v2(inputs, blocks, num_classes, + is_training=is_training, + global_pool=global_pool, + output_stride=output_stride, + include_root_block=include_root_block, + reuse=reuse, + scope=scope) + + def testClassificationEndPoints(self): + global_pool = True + num_classes = 10 + inputs = create_test_input(2, 224, 224, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + logits, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + self.assertTrue(logits.op.name.startswith('resnet/logits')) + self.assertListEqual(logits.get_shape().as_list(), [2, 1, 1, num_classes]) + self.assertTrue('predictions' in end_points) + self.assertListEqual(end_points['predictions'].get_shape().as_list(), + [2, 1, 1, num_classes]) + + def testClassificationShapes(self): + global_pool = True + num_classes = 10 + inputs = create_test_input(2, 224, 224, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 28, 28, 4], + 'resnet/block2': [2, 14, 14, 8], + 'resnet/block3': [2, 7, 7, 16], + 'resnet/block4': [2, 7, 7, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testFullyConvolutionalEndpointShapes(self): + global_pool = False + num_classes = 10 + inputs = create_test_input(2, 321, 321, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 41, 41, 4], + 'resnet/block2': [2, 21, 21, 8], + 'resnet/block3': [2, 11, 11, 16], + 'resnet/block4': [2, 11, 11, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testRootlessFullyConvolutionalEndpointShapes(self): + global_pool = False + num_classes = 10 + inputs = create_test_input(2, 128, 128, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + include_root_block=False, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 64, 64, 4], + 'resnet/block2': [2, 32, 32, 8], + 'resnet/block3': [2, 16, 16, 16], + 'resnet/block4': [2, 16, 16, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testAtrousFullyConvolutionalEndpointShapes(self): + global_pool = False + num_classes = 10 + output_stride = 8 + inputs = create_test_input(2, 321, 321, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + _, end_points = self._resnet_small(inputs, + num_classes, + global_pool=global_pool, + output_stride=output_stride, + scope='resnet') + endpoint_to_shape = { + 'resnet/block1': [2, 41, 41, 4], + 'resnet/block2': [2, 41, 41, 8], + 'resnet/block3': [2, 41, 41, 16], + 'resnet/block4': [2, 41, 41, 32]} + for endpoint in endpoint_to_shape: + shape = endpoint_to_shape[endpoint] + self.assertListEqual(end_points[endpoint].get_shape().as_list(), shape) + + def testAtrousFullyConvolutionalValues(self): + """Verify dense feature extraction with atrous convolution.""" + nominal_stride = 32 + for output_stride in [4, 8, 16, 32, None]: + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + with tf.Graph().as_default(): + with self.test_session() as sess: + tf.set_random_seed(0) + inputs = create_test_input(2, 81, 81, 3) + # Dense feature extraction followed by subsampling. + output, _ = self._resnet_small(inputs, None, + is_training=False, + global_pool=False, + output_stride=output_stride) + if output_stride is None: + factor = 1 + else: + factor = nominal_stride // output_stride + output = resnet_utils.subsample(output, factor) + # Make the two networks use the same weights. + tf.get_variable_scope().reuse_variables() + # Feature extraction at the nominal network rate. + expected, _ = self._resnet_small(inputs, None, + is_training=False, + global_pool=False) + sess.run(tf.initialize_all_variables()) + self.assertAllClose(output.eval(), expected.eval(), + atol=1e-4, rtol=1e-4) + + def testUnknownBatchSize(self): + batch = 2 + height, width = 65, 65 + global_pool = True + num_classes = 10 + inputs = create_test_input(None, height, width, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + logits, _ = self._resnet_small(inputs, num_classes, + global_pool=global_pool, + scope='resnet') + self.assertTrue(logits.op.name.startswith('resnet/logits')) + self.assertListEqual(logits.get_shape().as_list(), + [None, 1, 1, num_classes]) + images = create_test_input(batch, height, width, 3) + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(logits, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch, 1, 1, num_classes)) + + def testFullyConvolutionalUnknownHeightWidth(self): + batch = 2 + height, width = 65, 65 + global_pool = False + inputs = create_test_input(batch, None, None, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + output, _ = self._resnet_small(inputs, None, + global_pool=global_pool) + self.assertListEqual(output.get_shape().as_list(), + [batch, None, None, 32]) + images = create_test_input(batch, height, width, 3) + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(output, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch, 3, 3, 32)) + + def testAtrousFullyConvolutionalUnknownHeightWidth(self): + batch = 2 + height, width = 65, 65 + global_pool = False + output_stride = 8 + inputs = create_test_input(batch, None, None, 3) + with slim.arg_scope(resnet_utils.resnet_arg_scope()): + output, _ = self._resnet_small(inputs, + None, + global_pool=global_pool, + output_stride=output_stride) + self.assertListEqual(output.get_shape().as_list(), + [batch, None, None, 32]) + images = create_test_input(batch, height, width, 3) + with self.test_session() as sess: + sess.run(tf.initialize_all_variables()) + output = sess.run(output, {inputs: images.eval()}) + self.assertEqual(output.shape, (batch, 9, 9, 32)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/nets/vgg.py b/nets/vgg.py new file mode 100644 index 0000000..8c1821e --- /dev/null +++ b/nets/vgg.py @@ -0,0 +1,244 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains model definitions for versions of the Oxford VGG network. + +These model definitions were introduced in the following technical report: + + Very Deep Convolutional Networks For Large-Scale Image Recognition + Karen Simonyan and Andrew Zisserman + arXiv technical report, 2015 + PDF: http://arxiv.org/pdf/1409.1556.pdf + ILSVRC 2014 Slides: http://www.robots.ox.ac.uk/~karen/pdf/ILSVRC_2014.pdf + CC-BY-4.0 + +More information can be obtained from the VGG website: +www.robots.ox.ac.uk/~vgg/research/very_deep/ + +Usage: + with slim.arg_scope(vgg.vgg_arg_scope()): + outputs, end_points = vgg.vgg_a(inputs) + + with slim.arg_scope(vgg.vgg_arg_scope()): + outputs, end_points = vgg.vgg_16(inputs) + +@@vgg_a +@@vgg_16 +@@vgg_19 +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +slim = tf.contrib.slim + + +def vgg_arg_scope(weight_decay=0.0005): + """Defines the VGG arg scope. + + Args: + weight_decay: The l2 regularization coefficient. + + Returns: + An arg_scope. + """ + with slim.arg_scope([slim.conv2d, slim.fully_connected], + activation_fn=tf.nn.relu, + weights_regularizer=slim.l2_regularizer(weight_decay), + biases_initializer=tf.zeros_initializer): + with slim.arg_scope([slim.conv2d], padding='SAME') as arg_sc: + return arg_sc + + +def vgg_a(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.5, + spatial_squeeze=True, + scope='vgg_a'): + """Oxford Net VGG 11-Layers version A Example. + + Note: All the fully_connected layers have been transformed to conv2d layers. + To use in classification mode, resize input to 224x224. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether or not the model is being trained. + dropout_keep_prob: the probability that activations are kept in the dropout + layers during training. + spatial_squeeze: whether or not should squeeze the spatial dimensions of the + outputs. Useful to remove unnecessary dimensions for classification. + scope: Optional scope for the variables. + + Returns: + the last op containing the log predictions and end_points dict. + """ + with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc: + end_points_collection = sc.name + '_end_points' + # Collect outputs for conv2d, fully_connected and max_pool2d. + with slim.arg_scope([slim.conv2d, slim.max_pool2d], + outputs_collections=end_points_collection): + net = slim.repeat(inputs, 1, slim.conv2d, 64, [3, 3], scope='conv1') + net = slim.max_pool2d(net, [2, 2], scope='pool1') + net = slim.repeat(net, 1, slim.conv2d, 128, [3, 3], scope='conv2') + net = slim.max_pool2d(net, [2, 2], scope='pool2') + net = slim.repeat(net, 2, slim.conv2d, 256, [3, 3], scope='conv3') + net = slim.max_pool2d(net, [2, 2], scope='pool3') + net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv4') + net = slim.max_pool2d(net, [2, 2], scope='pool4') + net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5') + net = slim.max_pool2d(net, [2, 2], scope='pool5') + # Use conv2d instead of fully_connected layers. + net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout6') + net = slim.conv2d(net, 4096, [1, 1], scope='fc7') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout7') + net = slim.conv2d(net, num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + scope='fc8') + # Convert end_points_collection into a end_point dict. + end_points = dict(tf.get_collection(end_points_collection)) + if spatial_squeeze: + net = tf.squeeze(net, [1, 2], name='fc8/squeezed') + end_points[sc.name + '/fc8'] = net + return net, end_points +vgg_a.default_image_size = 224 + + +def vgg_16(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.5, + spatial_squeeze=True, + scope='vgg_16'): + """Oxford Net VGG 16-Layers version D Example. + + Note: All the fully_connected layers have been transformed to conv2d layers. + To use in classification mode, resize input to 224x224. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether or not the model is being trained. + dropout_keep_prob: the probability that activations are kept in the dropout + layers during training. + spatial_squeeze: whether or not should squeeze the spatial dimensions of the + outputs. Useful to remove unnecessary dimensions for classification. + scope: Optional scope for the variables. + + Returns: + the last op containing the log predictions and end_points dict. + """ + with tf.variable_scope(scope, 'vgg_16', [inputs]) as sc: + end_points_collection = sc.name + '_end_points' + # Collect outputs for conv2d, fully_connected and max_pool2d. + with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], + outputs_collections=end_points_collection): + net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1') + net = slim.max_pool2d(net, [2, 2], scope='pool1') + net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2') + net = slim.max_pool2d(net, [2, 2], scope='pool2') + net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3') + net = slim.max_pool2d(net, [2, 2], scope='pool3') + net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4') + net = slim.max_pool2d(net, [2, 2], scope='pool4') + net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5') + net = slim.max_pool2d(net, [2, 2], scope='pool5') + # Use conv2d instead of fully_connected layers. + net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout6') + net = slim.conv2d(net, 4096, [1, 1], scope='fc7') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout7') + net = slim.conv2d(net, num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + scope='fc8') + # Convert end_points_collection into a end_point dict. + end_points = dict(tf.get_collection(end_points_collection)) + if spatial_squeeze: + net = tf.squeeze(net, [1, 2], name='fc8/squeezed') + end_points[sc.name + '/fc8'] = net + return net, end_points +vgg_16.default_image_size = 224 + + +def vgg_19(inputs, + num_classes=1000, + is_training=True, + dropout_keep_prob=0.5, + spatial_squeeze=True, + scope='vgg_19'): + """Oxford Net VGG 19-Layers version E Example. + + Note: All the fully_connected layers have been transformed to conv2d layers. + To use in classification mode, resize input to 224x224. + + Args: + inputs: a tensor of size [batch_size, height, width, channels]. + num_classes: number of predicted classes. + is_training: whether or not the model is being trained. + dropout_keep_prob: the probability that activations are kept in the dropout + layers during training. + spatial_squeeze: whether or not should squeeze the spatial dimensions of the + outputs. Useful to remove unnecessary dimensions for classification. + scope: Optional scope for the variables. + + Returns: + the last op containing the log predictions and end_points dict. + """ + with tf.variable_scope(scope, 'vgg_19', [inputs]) as sc: + end_points_collection = sc.name + '_end_points' + # Collect outputs for conv2d, fully_connected and max_pool2d. + with slim.arg_scope([slim.conv2d, slim.fully_connected, slim.max_pool2d], + outputs_collections=end_points_collection): + net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1') + net = slim.max_pool2d(net, [2, 2], scope='pool1') + net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2') + net = slim.max_pool2d(net, [2, 2], scope='pool2') + net = slim.repeat(net, 4, slim.conv2d, 256, [3, 3], scope='conv3') + net = slim.max_pool2d(net, [2, 2], scope='pool3') + net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv4') + net = slim.max_pool2d(net, [2, 2], scope='pool4') + net = slim.repeat(net, 4, slim.conv2d, 512, [3, 3], scope='conv5') + net = slim.max_pool2d(net, [2, 2], scope='pool5') + # Use conv2d instead of fully_connected layers. + net = slim.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout6') + net = slim.conv2d(net, 4096, [1, 1], scope='fc7') + net = slim.dropout(net, dropout_keep_prob, is_training=is_training, + scope='dropout7') + net = slim.conv2d(net, num_classes, [1, 1], + activation_fn=None, + normalizer_fn=None, + scope='fc8') + # Convert end_points_collection into a end_point dict. + end_points = dict(tf.get_collection(end_points_collection)) + if spatial_squeeze: + net = tf.squeeze(net, [1, 2], name='fc8/squeezed') + end_points[sc.name + '/fc8'] = net + return net, end_points +vgg_19.default_image_size = 224 + +# Alias +vgg_d = vgg_16 +vgg_e = vgg_19 diff --git a/nets/vgg_test.py b/nets/vgg_test.py new file mode 100644 index 0000000..5927146 --- /dev/null +++ b/nets/vgg_test.py @@ -0,0 +1,455 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for slim.nets.vgg.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from nets import vgg + +slim = tf.contrib.slim + + +class VGGATest(tf.test.TestCase): + + def testBuild(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_a(inputs, num_classes) + self.assertEquals(logits.op.name, 'vgg_a/fc8/squeezed') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + + def testFullyConvolutional(self): + batch_size = 1 + height, width = 256, 256 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_a(inputs, num_classes, spatial_squeeze=False) + self.assertEquals(logits.op.name, 'vgg_a/fc8/BiasAdd') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, 2, 2, num_classes]) + + def testEndPoints(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = vgg.vgg_a(inputs, num_classes) + expected_names = ['vgg_a/conv1/conv1_1', + 'vgg_a/pool1', + 'vgg_a/conv2/conv2_1', + 'vgg_a/pool2', + 'vgg_a/conv3/conv3_1', + 'vgg_a/conv3/conv3_2', + 'vgg_a/pool3', + 'vgg_a/conv4/conv4_1', + 'vgg_a/conv4/conv4_2', + 'vgg_a/pool4', + 'vgg_a/conv5/conv5_1', + 'vgg_a/conv5/conv5_2', + 'vgg_a/pool5', + 'vgg_a/fc6', + 'vgg_a/fc7', + 'vgg_a/fc8' + ] + self.assertSetEqual(set(end_points.keys()), set(expected_names)) + + def testModelVariables(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + vgg.vgg_a(inputs, num_classes) + expected_names = ['vgg_a/conv1/conv1_1/weights', + 'vgg_a/conv1/conv1_1/biases', + 'vgg_a/conv2/conv2_1/weights', + 'vgg_a/conv2/conv2_1/biases', + 'vgg_a/conv3/conv3_1/weights', + 'vgg_a/conv3/conv3_1/biases', + 'vgg_a/conv3/conv3_2/weights', + 'vgg_a/conv3/conv3_2/biases', + 'vgg_a/conv4/conv4_1/weights', + 'vgg_a/conv4/conv4_1/biases', + 'vgg_a/conv4/conv4_2/weights', + 'vgg_a/conv4/conv4_2/biases', + 'vgg_a/conv5/conv5_1/weights', + 'vgg_a/conv5/conv5_1/biases', + 'vgg_a/conv5/conv5_2/weights', + 'vgg_a/conv5/conv5_2/biases', + 'vgg_a/fc6/weights', + 'vgg_a/fc6/biases', + 'vgg_a/fc7/weights', + 'vgg_a/fc7/biases', + 'vgg_a/fc8/weights', + 'vgg_a/fc8/biases', + ] + model_variables = [v.op.name for v in slim.get_model_variables()] + self.assertSetEqual(set(model_variables), set(expected_names)) + + def testEvaluation(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_a(eval_inputs, is_training=False) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + predictions = tf.argmax(logits, 1) + self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) + + def testTrainEvalWithReuse(self): + train_batch_size = 2 + eval_batch_size = 1 + train_height, train_width = 224, 224 + eval_height, eval_width = 256, 256 + num_classes = 1000 + with self.test_session(): + train_inputs = tf.random_uniform( + (train_batch_size, train_height, train_width, 3)) + logits, _ = vgg.vgg_a(train_inputs) + self.assertListEqual(logits.get_shape().as_list(), + [train_batch_size, num_classes]) + tf.get_variable_scope().reuse_variables() + eval_inputs = tf.random_uniform( + (eval_batch_size, eval_height, eval_width, 3)) + logits, _ = vgg.vgg_a(eval_inputs, is_training=False, + spatial_squeeze=False) + self.assertListEqual(logits.get_shape().as_list(), + [eval_batch_size, 2, 2, num_classes]) + logits = tf.reduce_mean(logits, [1, 2]) + predictions = tf.argmax(logits, 1) + self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) + + def testForward(self): + batch_size = 1 + height, width = 224, 224 + with self.test_session() as sess: + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_a(inputs) + sess.run(tf.initialize_all_variables()) + output = sess.run(logits) + self.assertTrue(output.any()) + + +class VGG16Test(tf.test.TestCase): + + def testBuild(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_16(inputs, num_classes) + self.assertEquals(logits.op.name, 'vgg_16/fc8/squeezed') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + + def testFullyConvolutional(self): + batch_size = 1 + height, width = 256, 256 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_16(inputs, num_classes, spatial_squeeze=False) + self.assertEquals(logits.op.name, 'vgg_16/fc8/BiasAdd') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, 2, 2, num_classes]) + + def testEndPoints(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = vgg.vgg_16(inputs, num_classes) + expected_names = ['vgg_16/conv1/conv1_1', + 'vgg_16/conv1/conv1_2', + 'vgg_16/pool1', + 'vgg_16/conv2/conv2_1', + 'vgg_16/conv2/conv2_2', + 'vgg_16/pool2', + 'vgg_16/conv3/conv3_1', + 'vgg_16/conv3/conv3_2', + 'vgg_16/conv3/conv3_3', + 'vgg_16/pool3', + 'vgg_16/conv4/conv4_1', + 'vgg_16/conv4/conv4_2', + 'vgg_16/conv4/conv4_3', + 'vgg_16/pool4', + 'vgg_16/conv5/conv5_1', + 'vgg_16/conv5/conv5_2', + 'vgg_16/conv5/conv5_3', + 'vgg_16/pool5', + 'vgg_16/fc6', + 'vgg_16/fc7', + 'vgg_16/fc8' + ] + self.assertSetEqual(set(end_points.keys()), set(expected_names)) + + def testModelVariables(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + vgg.vgg_16(inputs, num_classes) + expected_names = ['vgg_16/conv1/conv1_1/weights', + 'vgg_16/conv1/conv1_1/biases', + 'vgg_16/conv1/conv1_2/weights', + 'vgg_16/conv1/conv1_2/biases', + 'vgg_16/conv2/conv2_1/weights', + 'vgg_16/conv2/conv2_1/biases', + 'vgg_16/conv2/conv2_2/weights', + 'vgg_16/conv2/conv2_2/biases', + 'vgg_16/conv3/conv3_1/weights', + 'vgg_16/conv3/conv3_1/biases', + 'vgg_16/conv3/conv3_2/weights', + 'vgg_16/conv3/conv3_2/biases', + 'vgg_16/conv3/conv3_3/weights', + 'vgg_16/conv3/conv3_3/biases', + 'vgg_16/conv4/conv4_1/weights', + 'vgg_16/conv4/conv4_1/biases', + 'vgg_16/conv4/conv4_2/weights', + 'vgg_16/conv4/conv4_2/biases', + 'vgg_16/conv4/conv4_3/weights', + 'vgg_16/conv4/conv4_3/biases', + 'vgg_16/conv5/conv5_1/weights', + 'vgg_16/conv5/conv5_1/biases', + 'vgg_16/conv5/conv5_2/weights', + 'vgg_16/conv5/conv5_2/biases', + 'vgg_16/conv5/conv5_3/weights', + 'vgg_16/conv5/conv5_3/biases', + 'vgg_16/fc6/weights', + 'vgg_16/fc6/biases', + 'vgg_16/fc7/weights', + 'vgg_16/fc7/biases', + 'vgg_16/fc8/weights', + 'vgg_16/fc8/biases', + ] + model_variables = [v.op.name for v in slim.get_model_variables()] + self.assertSetEqual(set(model_variables), set(expected_names)) + + def testEvaluation(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_16(eval_inputs, is_training=False) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + predictions = tf.argmax(logits, 1) + self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) + + def testTrainEvalWithReuse(self): + train_batch_size = 2 + eval_batch_size = 1 + train_height, train_width = 224, 224 + eval_height, eval_width = 256, 256 + num_classes = 1000 + with self.test_session(): + train_inputs = tf.random_uniform( + (train_batch_size, train_height, train_width, 3)) + logits, _ = vgg.vgg_16(train_inputs) + self.assertListEqual(logits.get_shape().as_list(), + [train_batch_size, num_classes]) + tf.get_variable_scope().reuse_variables() + eval_inputs = tf.random_uniform( + (eval_batch_size, eval_height, eval_width, 3)) + logits, _ = vgg.vgg_16(eval_inputs, is_training=False, + spatial_squeeze=False) + self.assertListEqual(logits.get_shape().as_list(), + [eval_batch_size, 2, 2, num_classes]) + logits = tf.reduce_mean(logits, [1, 2]) + predictions = tf.argmax(logits, 1) + self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) + + def testForward(self): + batch_size = 1 + height, width = 224, 224 + with self.test_session() as sess: + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_16(inputs) + sess.run(tf.initialize_all_variables()) + output = sess.run(logits) + self.assertTrue(output.any()) + + +class VGG19Test(tf.test.TestCase): + + def testBuild(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_19(inputs, num_classes) + self.assertEquals(logits.op.name, 'vgg_19/fc8/squeezed') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + + def testFullyConvolutional(self): + batch_size = 1 + height, width = 256, 256 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_19(inputs, num_classes, spatial_squeeze=False) + self.assertEquals(logits.op.name, 'vgg_19/fc8/BiasAdd') + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, 2, 2, num_classes]) + + def testEndPoints(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + _, end_points = vgg.vgg_19(inputs, num_classes) + expected_names = [ + 'vgg_19/conv1/conv1_1', + 'vgg_19/conv1/conv1_2', + 'vgg_19/pool1', + 'vgg_19/conv2/conv2_1', + 'vgg_19/conv2/conv2_2', + 'vgg_19/pool2', + 'vgg_19/conv3/conv3_1', + 'vgg_19/conv3/conv3_2', + 'vgg_19/conv3/conv3_3', + 'vgg_19/conv3/conv3_4', + 'vgg_19/pool3', + 'vgg_19/conv4/conv4_1', + 'vgg_19/conv4/conv4_2', + 'vgg_19/conv4/conv4_3', + 'vgg_19/conv4/conv4_4', + 'vgg_19/pool4', + 'vgg_19/conv5/conv5_1', + 'vgg_19/conv5/conv5_2', + 'vgg_19/conv5/conv5_3', + 'vgg_19/conv5/conv5_4', + 'vgg_19/pool5', + 'vgg_19/fc6', + 'vgg_19/fc7', + 'vgg_19/fc8' + ] + self.assertSetEqual(set(end_points.keys()), set(expected_names)) + + def testModelVariables(self): + batch_size = 5 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + inputs = tf.random_uniform((batch_size, height, width, 3)) + vgg.vgg_19(inputs, num_classes) + expected_names = [ + 'vgg_19/conv1/conv1_1/weights', + 'vgg_19/conv1/conv1_1/biases', + 'vgg_19/conv1/conv1_2/weights', + 'vgg_19/conv1/conv1_2/biases', + 'vgg_19/conv2/conv2_1/weights', + 'vgg_19/conv2/conv2_1/biases', + 'vgg_19/conv2/conv2_2/weights', + 'vgg_19/conv2/conv2_2/biases', + 'vgg_19/conv3/conv3_1/weights', + 'vgg_19/conv3/conv3_1/biases', + 'vgg_19/conv3/conv3_2/weights', + 'vgg_19/conv3/conv3_2/biases', + 'vgg_19/conv3/conv3_3/weights', + 'vgg_19/conv3/conv3_3/biases', + 'vgg_19/conv3/conv3_4/weights', + 'vgg_19/conv3/conv3_4/biases', + 'vgg_19/conv4/conv4_1/weights', + 'vgg_19/conv4/conv4_1/biases', + 'vgg_19/conv4/conv4_2/weights', + 'vgg_19/conv4/conv4_2/biases', + 'vgg_19/conv4/conv4_3/weights', + 'vgg_19/conv4/conv4_3/biases', + 'vgg_19/conv4/conv4_4/weights', + 'vgg_19/conv4/conv4_4/biases', + 'vgg_19/conv5/conv5_1/weights', + 'vgg_19/conv5/conv5_1/biases', + 'vgg_19/conv5/conv5_2/weights', + 'vgg_19/conv5/conv5_2/biases', + 'vgg_19/conv5/conv5_3/weights', + 'vgg_19/conv5/conv5_3/biases', + 'vgg_19/conv5/conv5_4/weights', + 'vgg_19/conv5/conv5_4/biases', + 'vgg_19/fc6/weights', + 'vgg_19/fc6/biases', + 'vgg_19/fc7/weights', + 'vgg_19/fc7/biases', + 'vgg_19/fc8/weights', + 'vgg_19/fc8/biases', + ] + model_variables = [v.op.name for v in slim.get_model_variables()] + self.assertSetEqual(set(model_variables), set(expected_names)) + + def testEvaluation(self): + batch_size = 2 + height, width = 224, 224 + num_classes = 1000 + with self.test_session(): + eval_inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_19(eval_inputs, is_training=False) + self.assertListEqual(logits.get_shape().as_list(), + [batch_size, num_classes]) + predictions = tf.argmax(logits, 1) + self.assertListEqual(predictions.get_shape().as_list(), [batch_size]) + + def testTrainEvalWithReuse(self): + train_batch_size = 2 + eval_batch_size = 1 + train_height, train_width = 224, 224 + eval_height, eval_width = 256, 256 + num_classes = 1000 + with self.test_session(): + train_inputs = tf.random_uniform( + (train_batch_size, train_height, train_width, 3)) + logits, _ = vgg.vgg_19(train_inputs) + self.assertListEqual(logits.get_shape().as_list(), + [train_batch_size, num_classes]) + tf.get_variable_scope().reuse_variables() + eval_inputs = tf.random_uniform( + (eval_batch_size, eval_height, eval_width, 3)) + logits, _ = vgg.vgg_19(eval_inputs, is_training=False, + spatial_squeeze=False) + self.assertListEqual(logits.get_shape().as_list(), + [eval_batch_size, 2, 2, num_classes]) + logits = tf.reduce_mean(logits, [1, 2]) + predictions = tf.argmax(logits, 1) + self.assertEquals(predictions.get_shape().as_list(), [eval_batch_size]) + + def testForward(self): + batch_size = 1 + height, width = 224, 224 + with self.test_session() as sess: + inputs = tf.random_uniform((batch_size, height, width, 3)) + logits, _ = vgg.vgg_19(inputs) + sess.run(tf.initialize_all_variables()) + output = sess.run(logits) + self.assertTrue(output.any()) + +if __name__ == '__main__': + tf.test.main()