From 3dfc9ba8b3dd8edbdd117d04607b1fb87443fa96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 24 Nov 2016 17:23:11 +0100 Subject: [PATCH] Add word embeddings and NLP stub --- 14_recurrent_neural_networks.ipynb | 517 +++++++++++++++++++++++++++-- 1 file changed, 492 insertions(+), 25 deletions(-) diff --git a/14_recurrent_neural_networks.ipynb b/14_recurrent_neural_networks.ipynb index 10e4d9b..c950828 100644 --- a/14_recurrent_neural_networks.ipynb +++ b/14_recurrent_neural_networks.ipynb @@ -515,7 +515,7 @@ "X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])\n", "y = tf.placeholder(tf.int32, [None])\n", "\n", - "with tf.variable_scope(\"\", initializer=tf.contrib.layers.variance_scaling_initializer()):\n", + "with tf.variable_scope(\"rnn\", initializer=tf.contrib.layers.variance_scaling_initializer()):\n", " basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)\n", " outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)\n", "\n", @@ -578,7 +578,7 @@ "cell_type": "code", "execution_count": 26, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ @@ -600,8 +600,8 @@ "hidden1 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons1, activation=tf.nn.relu)\n", "hidden2 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons2, activation=tf.nn.relu)\n", "multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell([hidden1, hidden2])\n", - "outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)\n", - "\n", + "outputs, states_tuple = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)\n", + "states = tf.concat(concat_dim=1, values=states_tuple)\n", "logits = fully_connected(states, n_outputs, activation_fn=None)\n", "xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)\n", "loss = tf.reduce_mean(xentropy)\n", @@ -1028,7 +1028,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 43, "metadata": { "collapsed": false }, @@ -1074,7 +1074,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 44, "metadata": { "collapsed": false }, @@ -1116,7 +1116,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 45, "metadata": { "collapsed": false }, @@ -1136,8 +1136,8 @@ "X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])\n", "y = tf.placeholder(tf.int32, [None])\n", "\n", - "lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons, state_is_tuple=True)\n", - "multi_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell]*3, state_is_tuple=True)\n", + "lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons)\n", + "multi_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell]*3)\n", "outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)\n", "top_layer_h_state = states[-1][1]\n", "logits = fully_connected(top_layer_h_state, n_outputs, activation_fn=None, scope=\"softmax\")\n", @@ -1153,7 +1153,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 46, "metadata": { "collapsed": false }, @@ -1164,7 +1164,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 47, "metadata": { "collapsed": false }, @@ -1175,7 +1175,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 48, "metadata": { "collapsed": false }, @@ -1205,7 +1205,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 49, "metadata": { "collapsed": false }, @@ -1233,7 +1233,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 50, 
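A quick note on the `outputs, states_tuple = tf.nn.dynamic_rnn(...)` change above: with a `MultiRNNCell`, `dynamic_rnn` returns one final state per layer (a tuple), so the states must be concatenated before they can feed the fully connected layer. Here is a minimal NumPy sketch of what that concatenation amounts to (the layer sizes are made up for illustration, not the notebook's values):

```python
import numpy as np

# Hypothetical sizes, for illustration only.
batch_size, n_neurons1, n_neurons2 = 4, 150, 100

# With a 2-layer MultiRNNCell, dynamic_rnn's final state is a tuple with
# one entry per layer, each of shape [batch_size, layer_size].
states_tuple = (np.zeros((batch_size, n_neurons1)),   # layer 1 final state
                np.zeros((batch_size, n_neurons2)))   # layer 2 final state

# Concatenating along axis 1 (what tf.concat(concat_dim=1, ...) does in the
# cell above) gives a single [batch_size, n_neurons1 + n_neurons2] matrix
# that the fully connected softmax layer can consume.
states = np.concatenate(states_tuple, axis=1)
print(states.shape)   # (4, 250)
```

For the stacked-LSTM classifier further down, each layer's final state is itself a `(c, h)` pair, which is why that cell uses `states[-1][1]` (the `h` state of the top layer) instead of concatenating everything.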
"metadata": { "collapsed": false }, @@ -1255,7 +1255,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 51, "metadata": { "collapsed": false }, @@ -1266,6 +1266,482 @@ " print(sess.run(outputs, feed_dict={X: rnd.rand(2, n_steps, n_inputs)}))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Embeddings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This section is based on TensorFlow's [Word2Vec tutorial](https://www.tensorflow.org/versions/r0.11/tutorials/word2vec/index.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fetch the data" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from six.moves import urllib\n", + "\n", + "import os\n", + "import zipfile\n", + "import urllib.request\n", + "\n", + "WORDS_PATH = \"datasets/words\"\n", + "WORDS_URL = 'http://mattmahoney.net/dc/text8.zip'\n", + "\n", + "def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH):\n", + " os.makedirs(words_path, exist_ok=True)\n", + " zip_path = os.path.join(words_path, \"words.zip\")\n", + " if not os.path.exists(zip_path):\n", + " urllib.request.urlretrieve(words_url, zip_path)\n", + " with zipfile.ZipFile(zip_path) as f:\n", + " data = f.read(f.namelist()[0])\n", + " return data.decode(\"ascii\").split()" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "words = fetch_words_data()" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "words[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "vocabulary_size = 50000\n", + "\n", + "vocabulary = [(\"UNK\", None)] + Counter(words).most_common(vocabulary_size - 1)\n", + "vocabulary = np.array([word for word, _ in vocabulary])\n", + "dictionary = {word: code for code, word in enumerate(vocabulary)}\n", + "data = np.array([dictionary.get(word, 0) for word in words])" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "\" \".join(words[:9]), data[:9]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "\" \".join([vocabulary[word_index] for word_index in [5241, 3081, 12, 6, 195, 2, 3134, 46, 59]])" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "words[24], data[24]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generate batches" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import random\n", + "from collections import deque\n", + "\n", + "def generate_batch(batch_size, num_skips, skip_window):\n", + " global data_index\n", + " assert batch_size % num_skips == 0\n", + " assert num_skips <= 2 * skip_window\n", + " batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n", + " labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n", + " span = 2 * skip_window + 1 # [ skip_window 
target skip_window ]\n", + " buffer = deque(maxlen=span)\n", + " for _ in range(span):\n", + " buffer.append(data[data_index])\n", + " data_index = (data_index + 1) % len(data)\n", + " for i in range(batch_size // num_skips):\n", + " target = skip_window # target label at the center of the buffer\n", + " targets_to_avoid = [ skip_window ]\n", + " for j in range(num_skips):\n", + " while target in targets_to_avoid:\n", + " target = random.randint(0, span - 1)\n", + " targets_to_avoid.append(target)\n", + " batch[i * num_skips + j] = buffer[skip_window]\n", + " labels[i * num_skips + j, 0] = buffer[target]\n", + " buffer.append(data[data_index])\n", + " data_index = (data_index + 1) % len(data)\n", + " return batch, labels" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data_index=0\n", + "batch, labels = generate_batch(8, 2, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "batch, [vocabulary[word] for word in batch]" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "labels, [vocabulary[word] for word in labels[:, 0]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build the model" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "batch_size = 128\n", + "embedding_size = 128 # Dimension of the embedding vector.\n", + "skip_window = 1 # How many words to consider left and right.\n", + "num_skips = 2 # How many times to reuse an input to generate a label.\n", + "\n", + "# We pick a random validation set to sample nearest neighbors. 
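To make the skip-gram pairs produced by `generate_batch()` above more concrete, here is a tiny self-contained sketch (simplified: it enumerates every context word in the window instead of sampling `num_skips` of them, and skips the `deque`/`data_index` bookkeeping; the sentence is the start of the text8 corpus):

```python
# Simplified illustration of skip-gram (center word, context word) pairs.
sentence = "anarchism originated as a term of abuse".split()
skip_window = 1                      # one context word on each side
pairs = []                           # (center word, context word)
for i in range(skip_window, len(sentence) - skip_window):
    center = sentence[i]
    for j in range(i - skip_window, i + skip_window + 1):
        if j != i:
            pairs.append((center, sentence[j]))

print(pairs[:6])
# [('originated', 'anarchism'), ('originated', 'as'),
#  ('as', 'originated'), ('as', 'a'),
#  ('a', 'as'), ('a', 'term')]
```

The real `generate_batch()` works on word IDs rather than strings and randomly picks only `num_skips` context words per center word, but the pairs it returns have the same structure.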
Here we limit the\n", + "# validation samples to the words that have a low numeric ID, which by\n", + "# construction are also the most frequent.\n", + "valid_size = 16 # Random set of words to evaluate similarity on.\n", + "valid_window = 100 # Only pick dev samples in the head of the distribution.\n", + "valid_examples = rnd.choice(valid_window, valid_size, replace=False)\n", + "num_sampled = 64 # Number of negative examples to sample.\n", + "\n", + "learning_rate = 0.01" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tf.reset_default_graph()\n", + "\n", + "# Input data.\n", + "train_inputs = tf.placeholder(tf.int32, shape=[batch_size])\n", + "train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n", + "valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n", + "\n", + "# Look up embeddings for inputs.\n", + "init_embeddings = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)\n", + "embeddings = tf.Variable(init_embeddings)\n", + "embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n", + "\n", + "# Construct the variables for the NCE loss\n", + "nce_weights = tf.Variable(\n", + " tf.truncated_normal([vocabulary_size, embedding_size],\n", + " stddev=1.0 / np.sqrt(embedding_size)))\n", + "nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n", + "\n", + "# Compute the average NCE loss for the batch.\n", + "# tf.nce_loss automatically draws a new sample of the negative labels each\n", + "# time we evaluate the loss.\n", + "loss = tf.reduce_mean(\n", + " tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,\n", + " num_sampled, vocabulary_size))\n", + "\n", + "# Construct the Adam optimizer\n", + "optimizer = tf.train.AdamOptimizer(learning_rate)\n", + "training_op = optimizer.minimize(loss)\n", + "\n", + "# Compute the cosine similarity between minibatch examples and all embeddings.\n", + "norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), reduction_indices=1, keep_dims=True))\n", + "normalized_embeddings = embeddings / norm\n", + "valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)\n", + "similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)\n", + "\n", + "# Add variable initializer.\n", + "init = tf.initialize_all_variables()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train the model" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "num_steps = 100001\n", + "\n", + "with tf.Session() as session:\n", + " init.run()\n", + "\n", + " average_loss = 0\n", + " for step in range(num_steps):\n", + " print(\"\\rIteration: {}\".format(step), end=\"\\t\")\n", + " batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)\n", + " feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}\n", + "\n", + " # We perform one update step by evaluating the training op (including it\n", + " # in the list of returned values for session.run()\n", + " _, loss_val = session.run([training_op, loss], feed_dict=feed_dict)\n", + " average_loss += loss_val\n", + "\n", + " if step % 2000 == 0:\n", + " if step > 0:\n", + " average_loss /= 2000\n", + " # The average loss is an estimate of the loss over the last 2000 batches.\n", + " print(\"Average loss at step \", step, \": \", average_loss)\n", + " average_loss = 0\n", + "\n", + " # Note that this is expensive (~20% slowdown 
if computed every 500 steps)\n", + " if step % 10000 == 0:\n", + " sim = similarity.eval()\n", + " for i in range(valid_size):\n", + " valid_word = vocabulary[valid_examples[i]]\n", + " top_k = 8 # number of nearest neighbors\n", + " nearest = (-sim[i, :]).argsort()[1:top_k+1]\n", + " log_str = \"Nearest to %s:\" % valid_word\n", + " for k in range(top_k):\n", + " close_word = vocabulary[nearest[k]]\n", + " log_str = \"%s %s,\" % (log_str, close_word)\n", + " print(log_str)\n", + "\n", + " final_embeddings = normalized_embeddings.eval()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's save the final embeddings (of course you can use a TensorFlow `Saver` if you prefer):" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.save(\"my_final_embeddings.npy\", final_embeddings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plot the embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def plot_with_labels(low_dim_embs, labels):\n", + " assert low_dim_embs.shape[0] >= len(labels), \"More labels than embeddings\"\n", + " plt.figure(figsize=(18, 18)) #in inches\n", + " for i, label in enumerate(labels):\n", + " x, y = low_dim_embs[i,:]\n", + " plt.scatter(x, y)\n", + " plt.annotate(label,\n", + " xy=(x, y),\n", + " xytext=(5, 2),\n", + " textcoords='offset points',\n", + " ha='right',\n", + " va='bottom')" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.manifold import TSNE\n", + "\n", + "tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)\n", + "plot_only = 500\n", + "low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])\n", + "labels = [vocabulary[i] for i in range(plot_only)]\n", + "plot_with_labels(low_dim_embs, labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Machine Translation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `basic_rnn_seq2seq()` function creates a simple Encoder/Decoder model: it first runs an RNN to encode `encoder_inputs` into a state vector, then runs a decoder initialized with the last encoder state on `decoder_inputs`. Encoder and decoder use the same RNN cell type but they don't share parameters." 
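Before the translation code, a short sketch of the `Y_input`/`Y_target` split used in the next cell: the decoder reads the target sequence minus its last token and must predict the sequence minus its first token, i.e. the next word at every step (the integer IDs below are made up; the next cell's `embedding_rnn_seq2seq()` additionally embeds these integer word IDs, unlike the plain `basic_rnn_seq2seq()` described above):

```python
import numpy as np

# Hypothetical token IDs for one target sentence, e.g. <go> w1 w2 w3 <eos>.
Y = np.array([[0, 42, 7, 13, 1]])

Y_input  = Y[:, :-1]   # what the decoder reads:    [[ 0, 42,  7, 13]]
Y_target = Y[:, 1:]    # what it must predict next: [[42,  7, 13,  1]]

print(Y_input)
print(Y_target)
```

This is the usual teacher-forcing setup: at training time the decoder sees the ground-truth previous word rather than its own prediction.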
+ ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "tf.reset_default_graph()\n", + "\n", + "n_steps = 50\n", + "n_neurons = 200\n", + "n_layers = 3\n", + "num_encoder_symbols = 20000\n", + "num_decoder_symbols = 20000\n", + "embedding_size = 150\n", + "learning_rate = 0.01\n", + "\n", + "X = tf.placeholder(tf.int32, [None, n_steps]) # English sentences\n", + "Y = tf.placeholder(tf.int32, [None, n_steps]) # French translations\n", + "W = tf.placeholder(tf.float32, [None, n_steps - 1, 1])\n", + "Y_input = Y[:, :-1]\n", + "Y_target = Y[:, 1:]\n", + "\n", + "encoder_inputs = tf.unpack(tf.transpose(X)) # list of 1D tensors\n", + "decoder_inputs = tf.unpack(tf.transpose(Y_input)) # list of 1D tensors\n", + "\n", + "lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons)\n", + "cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * n_layers)\n", + "\n", + "output_seqs, states = tf.nn.seq2seq.embedding_rnn_seq2seq(\n", + " encoder_inputs,\n", + " decoder_inputs,\n", + " cell,\n", + " num_encoder_symbols,\n", + " num_decoder_symbols,\n", + " embedding_size)\n", + "\n", + "logits = tf.transpose(tf.pack(output_seqs), perm=[1, 0, 2])" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "logits_flat = tf.reshape(logits, [-1, num_decoder_symbols])\n", + "Y_target_flat = tf.reshape(Y_target, [-1])\n", + "W_flat = tf.reshape(W, [-1])\n", + "xentropy = W_flat * tf.nn.sparse_softmax_cross_entropy_with_logits(logits_flat, Y_target_flat)\n", + "loss = tf.reduce_mean(xentropy)\n", + "optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n", + "training_op = optimizer.minimize(loss)\n", + "\n", + "init = tf.initialize_all_variables()" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1281,15 +1757,6 @@ "source": [ "**Coming soon**" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { @@ -1308,7 +1775,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.1" + "version": "3.5.2" }, "nav_menu": {}, "toc": {
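One more note on the weight placeholder `W` in the translation model above: its role is to zero out the cross-entropy on padded time steps so they do not drive the gradients. A minimal NumPy sketch of that masking (the numbers are made up; the cell above then simply takes `reduce_mean` over all positions, whereas dividing by the number of real tokens, as below, averages over unpadded steps only):

```python
import numpy as np

# Hypothetical per-token cross-entropy for a batch of 2 sequences of length 4,
# where the second sequence has only 2 real tokens followed by padding.
per_token_xentropy = np.array([[0.7, 1.2, 0.3, 0.9],
                               [1.1, 0.8, 2.0, 2.0]])
weights = np.array([[1., 1., 1., 1.],
                    [1., 1., 0., 0.]])      # zeros on the padded steps

masked = per_token_xentropy * weights       # padded steps contribute nothing
loss = masked.sum() / weights.sum()         # average over real tokens only
print(loss)                                 # 0.8333...
```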