Add word embeddings and NLP stub
parent
ccb28831cb
commit
3dfc9ba8b3
|
@ -515,7 +515,7 @@
|
||||||
"X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])\n",
|
"X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])\n",
|
||||||
"y = tf.placeholder(tf.int32, [None])\n",
|
"y = tf.placeholder(tf.int32, [None])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"with tf.variable_scope(\"\", initializer=tf.contrib.layers.variance_scaling_initializer()):\n",
|
"with tf.variable_scope(\"rnn\", initializer=tf.contrib.layers.variance_scaling_initializer()):\n",
|
||||||
" basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)\n",
|
" basic_cell = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu)\n",
|
||||||
" outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)\n",
|
" outputs, states = tf.nn.dynamic_rnn(basic_cell, X, dtype=tf.float32)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -578,7 +578,7 @@
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 26,
|
"execution_count": 26,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": true
|
"collapsed": false
|
||||||
},
|
},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
|
@ -600,8 +600,8 @@
|
||||||
"hidden1 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons1, activation=tf.nn.relu)\n",
|
"hidden1 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons1, activation=tf.nn.relu)\n",
|
||||||
"hidden2 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons2, activation=tf.nn.relu)\n",
|
"hidden2 = tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons2, activation=tf.nn.relu)\n",
|
||||||
"multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell([hidden1, hidden2])\n",
|
"multi_layer_cell = tf.nn.rnn_cell.MultiRNNCell([hidden1, hidden2])\n",
|
||||||
"outputs, states = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)\n",
|
"outputs, states_tuple = tf.nn.dynamic_rnn(multi_layer_cell, X, dtype=tf.float32)\n",
|
||||||
"\n",
|
"states = tf.concat(concat_dim=1, values=states_tuple)\n",
|
||||||
"logits = fully_connected(states, n_outputs, activation_fn=None)\n",
|
"logits = fully_connected(states, n_outputs, activation_fn=None)\n",
|
||||||
"xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)\n",
|
"xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y)\n",
|
||||||
"loss = tf.reduce_mean(xentropy)\n",
|
"loss = tf.reduce_mean(xentropy)\n",
|
||||||
|
@ -1028,7 +1028,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 45,
|
"execution_count": 43,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1074,7 +1074,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 46,
|
"execution_count": 44,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1116,7 +1116,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 55,
|
"execution_count": 45,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1136,8 +1136,8 @@
|
||||||
"X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])\n",
|
"X = tf.placeholder(tf.float32, [None, n_steps, n_inputs])\n",
|
||||||
"y = tf.placeholder(tf.int32, [None])\n",
|
"y = tf.placeholder(tf.int32, [None])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons, state_is_tuple=True)\n",
|
"lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons)\n",
|
||||||
"multi_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell]*3, state_is_tuple=True)\n",
|
"multi_cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell]*3)\n",
|
||||||
"outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)\n",
|
"outputs, states = tf.nn.dynamic_rnn(multi_cell, X, dtype=tf.float32)\n",
|
||||||
"top_layer_h_state = states[-1][1]\n",
|
"top_layer_h_state = states[-1][1]\n",
|
||||||
"logits = fully_connected(top_layer_h_state, n_outputs, activation_fn=None, scope=\"softmax\")\n",
|
"logits = fully_connected(top_layer_h_state, n_outputs, activation_fn=None, scope=\"softmax\")\n",
|
||||||
|
@ -1153,7 +1153,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 56,
|
"execution_count": 46,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1164,7 +1164,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 57,
|
"execution_count": 47,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1175,7 +1175,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 58,
|
"execution_count": 48,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1205,7 +1205,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 59,
|
"execution_count": 49,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1233,7 +1233,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 60,
|
"execution_count": 50,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1255,7 +1255,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 61,
|
"execution_count": 51,
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"collapsed": false
|
"collapsed": false
|
||||||
},
|
},
|
||||||
|
@ -1266,6 +1266,482 @@
|
||||||
" print(sess.run(outputs, feed_dict={X: rnd.rand(2, n_steps, n_inputs)}))"
|
" print(sess.run(outputs, feed_dict={X: rnd.rand(2, n_steps, n_inputs)}))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Embeddings"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This section is based on TensorFlow's [Word2Vec tutorial](https://www.tensorflow.org/versions/r0.11/tutorials/word2vec/index.html)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Fetch the data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 52,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from six.moves import urllib\n",
|
||||||
|
"\n",
|
||||||
|
"import os\n",
|
||||||
|
"import zipfile\n",
|
||||||
|
"import urllib.request\n",
|
||||||
|
"\n",
|
||||||
|
"WORDS_PATH = \"datasets/words\"\n",
|
||||||
|
"WORDS_URL = 'http://mattmahoney.net/dc/text8.zip'\n",
|
||||||
|
"\n",
|
||||||
|
"def fetch_words_data(words_url=WORDS_URL, words_path=WORDS_PATH):\n",
|
||||||
|
" os.makedirs(words_path, exist_ok=True)\n",
|
||||||
|
" zip_path = os.path.join(words_path, \"words.zip\")\n",
|
||||||
|
" if not os.path.exists(zip_path):\n",
|
||||||
|
" urllib.request.urlretrieve(words_url, zip_path)\n",
|
||||||
|
" with zipfile.ZipFile(zip_path) as f:\n",
|
||||||
|
" data = f.read(f.namelist()[0])\n",
|
||||||
|
" return data.decode(\"ascii\").split()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 53,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"words = fetch_words_data()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 54,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"words[:5]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build the dictionary"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 55,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from collections import Counter\n",
|
||||||
|
"\n",
|
||||||
|
"vocabulary_size = 50000\n",
|
||||||
|
"\n",
|
||||||
|
"vocabulary = [(\"UNK\", None)] + Counter(words).most_common(vocabulary_size - 1)\n",
|
||||||
|
"vocabulary = np.array([word for word, _ in vocabulary])\n",
|
||||||
|
"dictionary = {word: code for code, word in enumerate(vocabulary)}\n",
|
||||||
|
"data = np.array([dictionary.get(word, 0) for word in words])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 56,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\" \".join(words[:9]), data[:9]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 57,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\" \".join([vocabulary[word_index] for word_index in [5241, 3081, 12, 6, 195, 2, 3134, 46, 59]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 58,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"words[24], data[24]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Generate batches"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 59,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"from collections import deque\n",
|
||||||
|
"\n",
|
||||||
|
"def generate_batch(batch_size, num_skips, skip_window):\n",
|
||||||
|
" global data_index\n",
|
||||||
|
" assert batch_size % num_skips == 0\n",
|
||||||
|
" assert num_skips <= 2 * skip_window\n",
|
||||||
|
" batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
|
||||||
|
" labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
|
||||||
|
" span = 2 * skip_window + 1 # [ skip_window target skip_window ]\n",
|
||||||
|
" buffer = deque(maxlen=span)\n",
|
||||||
|
" for _ in range(span):\n",
|
||||||
|
" buffer.append(data[data_index])\n",
|
||||||
|
" data_index = (data_index + 1) % len(data)\n",
|
||||||
|
" for i in range(batch_size // num_skips):\n",
|
||||||
|
" target = skip_window # target label at the center of the buffer\n",
|
||||||
|
" targets_to_avoid = [ skip_window ]\n",
|
||||||
|
" for j in range(num_skips):\n",
|
||||||
|
" while target in targets_to_avoid:\n",
|
||||||
|
" target = random.randint(0, span - 1)\n",
|
||||||
|
" targets_to_avoid.append(target)\n",
|
||||||
|
" batch[i * num_skips + j] = buffer[skip_window]\n",
|
||||||
|
" labels[i * num_skips + j, 0] = buffer[target]\n",
|
||||||
|
" buffer.append(data[data_index])\n",
|
||||||
|
" data_index = (data_index + 1) % len(data)\n",
|
||||||
|
" return batch, labels"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 60,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data_index=0\n",
|
||||||
|
"batch, labels = generate_batch(8, 2, 1)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 61,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"batch, [vocabulary[word] for word in batch]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 62,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"labels, [vocabulary[word] for word in labels[:, 0]]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Build the model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 63,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"batch_size = 128\n",
|
||||||
|
"embedding_size = 128 # Dimension of the embedding vector.\n",
|
||||||
|
"skip_window = 1 # How many words to consider left and right.\n",
|
||||||
|
"num_skips = 2 # How many times to reuse an input to generate a label.\n",
|
||||||
|
"\n",
|
||||||
|
"# We pick a random validation set to sample nearest neighbors. Here we limit the\n",
|
||||||
|
"# validation samples to the words that have a low numeric ID, which by\n",
|
||||||
|
"# construction are also the most frequent.\n",
|
||||||
|
"valid_size = 16 # Random set of words to evaluate similarity on.\n",
|
||||||
|
"valid_window = 100 # Only pick dev samples in the head of the distribution.\n",
|
||||||
|
"valid_examples = rnd.choice(valid_window, valid_size, replace=False)\n",
|
||||||
|
"num_sampled = 64 # Number of negative examples to sample.\n",
|
||||||
|
"\n",
|
||||||
|
"learning_rate = 0.01"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 64,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tf.reset_default_graph()\n",
|
||||||
|
"\n",
|
||||||
|
"# Input data.\n",
|
||||||
|
"train_inputs = tf.placeholder(tf.int32, shape=[batch_size])\n",
|
||||||
|
"train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
|
||||||
|
"valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
|
||||||
|
"\n",
|
||||||
|
"# Look up embeddings for inputs.\n",
|
||||||
|
"init_embeddings = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)\n",
|
||||||
|
"embeddings = tf.Variable(init_embeddings)\n",
|
||||||
|
"embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n",
|
||||||
|
"\n",
|
||||||
|
"# Construct the variables for the NCE loss\n",
|
||||||
|
"nce_weights = tf.Variable(\n",
|
||||||
|
" tf.truncated_normal([vocabulary_size, embedding_size],\n",
|
||||||
|
" stddev=1.0 / np.sqrt(embedding_size)))\n",
|
||||||
|
"nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
|
||||||
|
"\n",
|
||||||
|
"# Compute the average NCE loss for the batch.\n",
|
||||||
|
"# tf.nn.nce_loss automatically draws a new sample of the negative labels each\n",
|
||||||
|
"# time we evaluate the loss.\n",
|
||||||
|
"loss = tf.reduce_mean(\n",
|
||||||
|
" tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,\n",
|
||||||
|
" num_sampled, vocabulary_size))\n",
|
||||||
|
"\n",
|
||||||
|
"# Construct the Adam optimizer\n",
|
||||||
|
"optimizer = tf.train.AdamOptimizer(learning_rate)\n",
|
||||||
|
"training_op = optimizer.minimize(loss)\n",
|
||||||
|
"\n",
|
||||||
|
"# Compute the cosine similarity between the validation examples and all embeddings.\n",
|
||||||
|
"norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), reduction_indices=1, keep_dims=True))\n",
|
||||||
|
"normalized_embeddings = embeddings / norm\n",
|
||||||
|
"valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)\n",
|
||||||
|
"similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Add variable initializer.\n",
|
||||||
|
"init = tf.initialize_all_variables()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Train the model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 65,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"num_steps = 100001\n",
|
||||||
|
"\n",
|
||||||
|
"with tf.Session() as session:\n",
|
||||||
|
" init.run()\n",
|
||||||
|
"\n",
|
||||||
|
" average_loss = 0\n",
|
||||||
|
" for step in range(num_steps):\n",
|
||||||
|
" print(\"\\rIteration: {}\".format(step), end=\"\\t\")\n",
|
||||||
|
" batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)\n",
|
||||||
|
" feed_dict = {train_inputs : batch_inputs, train_labels : batch_labels}\n",
|
||||||
|
"\n",
|
||||||
|
" # We perform one update step by evaluating the training op (including it\n",
|
||||||
|
" # in the list of returned values for session.run())\n",
|
||||||
|
" _, loss_val = session.run([training_op, loss], feed_dict=feed_dict)\n",
|
||||||
|
" average_loss += loss_val\n",
|
||||||
|
"\n",
|
||||||
|
" if step % 2000 == 0:\n",
|
||||||
|
" if step > 0:\n",
|
||||||
|
" average_loss /= 2000\n",
|
||||||
|
" # The average loss is an estimate of the loss over the last 2000 batches.\n",
|
||||||
|
" print(\"Average loss at step \", step, \": \", average_loss)\n",
|
||||||
|
" average_loss = 0\n",
|
||||||
|
"\n",
|
||||||
|
" # Note that this is expensive (~20% slowdown if computed every 500 steps)\n",
|
||||||
|
" if step % 10000 == 0:\n",
|
||||||
|
" sim = similarity.eval()\n",
|
||||||
|
" for i in range(valid_size):\n",
|
||||||
|
" valid_word = vocabulary[valid_examples[i]]\n",
|
||||||
|
" top_k = 8 # number of nearest neighbors\n",
|
||||||
|
" nearest = (-sim[i, :]).argsort()[1:top_k+1]\n",
|
||||||
|
" log_str = \"Nearest to %s:\" % valid_word\n",
|
||||||
|
" for k in range(top_k):\n",
|
||||||
|
" close_word = vocabulary[nearest[k]]\n",
|
||||||
|
" log_str = \"%s %s,\" % (log_str, close_word)\n",
|
||||||
|
" print(log_str)\n",
|
||||||
|
"\n",
|
||||||
|
" final_embeddings = normalized_embeddings.eval()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Let's save the final embeddings (of course you can use a TensorFlow `Saver` if you prefer):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 66,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"np.save(\"my_final_embeddings.npy\", final_embeddings)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Plot the embeddings"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 67,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": true
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def plot_with_labels(low_dim_embs, labels):\n",
|
||||||
|
" assert low_dim_embs.shape[0] >= len(labels), \"More labels than embeddings\"\n",
|
||||||
|
" plt.figure(figsize=(18, 18)) #in inches\n",
|
||||||
|
" for i, label in enumerate(labels):\n",
|
||||||
|
" x, y = low_dim_embs[i,:]\n",
|
||||||
|
" plt.scatter(x, y)\n",
|
||||||
|
" plt.annotate(label,\n",
|
||||||
|
" xy=(x, y),\n",
|
||||||
|
" xytext=(5, 2),\n",
|
||||||
|
" textcoords='offset points',\n",
|
||||||
|
" ha='right',\n",
|
||||||
|
" va='bottom')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 68,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from sklearn.manifold import TSNE\n",
|
||||||
|
"\n",
|
||||||
|
"tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)\n",
|
||||||
|
"plot_only = 500\n",
|
||||||
|
"low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])\n",
|
||||||
|
"labels = [vocabulary[i] for i in range(plot_only)]\n",
|
||||||
|
"plot_with_labels(low_dim_embs, labels)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Machine Translation"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The `embedding_rnn_seq2seq()` function creates a simple Encoder/Decoder model: it first embeds `encoder_inputs` and runs an RNN to encode them into a state vector, then embeds `decoder_inputs` and runs a decoder on them, initialized with the last encoder state. Encoder and decoder use the same RNN cell type but they don't share parameters."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 69,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import tensorflow as tf\n",
|
||||||
|
"tf.reset_default_graph()\n",
|
||||||
|
"\n",
|
||||||
|
"n_steps = 50\n",
|
||||||
|
"n_neurons = 200\n",
|
||||||
|
"n_layers = 3\n",
|
||||||
|
"num_encoder_symbols = 20000\n",
|
||||||
|
"num_decoder_symbols = 20000\n",
|
||||||
|
"embedding_size = 150\n",
|
||||||
|
"learning_rate = 0.01\n",
|
||||||
|
"\n",
|
||||||
|
"X = tf.placeholder(tf.int32, [None, n_steps]) # English sentences\n",
|
||||||
|
"Y = tf.placeholder(tf.int32, [None, n_steps]) # French translations\n",
|
||||||
|
"W = tf.placeholder(tf.float32, [None, n_steps - 1, 1])\n",
|
||||||
|
"Y_input = Y[:, :-1]\n",
|
||||||
|
"Y_target = Y[:, 1:]\n",
|
||||||
|
"\n",
|
||||||
|
"encoder_inputs = tf.unpack(tf.transpose(X)) # list of 1D tensors\n",
|
||||||
|
"decoder_inputs = tf.unpack(tf.transpose(Y_input)) # list of 1D tensors\n",
|
||||||
|
"\n",
|
||||||
|
"lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=n_neurons)\n",
|
||||||
|
"cell = tf.nn.rnn_cell.MultiRNNCell([lstm_cell] * n_layers)\n",
|
||||||
|
"\n",
|
||||||
|
"output_seqs, states = tf.nn.seq2seq.embedding_rnn_seq2seq(\n",
|
||||||
|
" encoder_inputs,\n",
|
||||||
|
" decoder_inputs,\n",
|
||||||
|
" cell,\n",
|
||||||
|
" num_encoder_symbols,\n",
|
||||||
|
" num_decoder_symbols,\n",
|
||||||
|
" embedding_size)\n",
|
||||||
|
"\n",
|
||||||
|
"logits = tf.transpose(tf.pack(output_seqs), perm=[1, 0, 2])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 70,
|
||||||
|
"metadata": {
|
||||||
|
"collapsed": false
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"logits_flat = tf.reshape(logits, [-1, num_decoder_symbols])\n",
|
||||||
|
"Y_target_flat = tf.reshape(Y_target, [-1])\n",
|
||||||
|
"W_flat = tf.reshape(W, [-1])\n",
|
||||||
|
"xentropy = W_flat * tf.nn.sparse_softmax_cross_entropy_with_logits(logits_flat, Y_target_flat)\n",
|
||||||
|
"loss = tf.reduce_mean(xentropy)\n",
|
||||||
|
"optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)\n",
|
||||||
|
"training_op = optimizer.minimize(loss)\n",
|
||||||
|
"\n",
|
||||||
|
"init = tf.initialize_all_variables()"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -1281,15 +1757,6 @@
|
||||||
"source": [
|
"source": [
|
||||||
"**Coming soon**"
|
"**Coming soon**"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {
|
|
||||||
"collapsed": true
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
@ -1308,7 +1775,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.5.1"
|
"version": "3.5.2"
|
||||||
},
|
},
|
||||||
"nav_menu": {},
|
"nav_menu": {},
|
||||||
"toc": {
|
"toc": {
|
||||||
|
|
Loading…
Reference in New Issue