From df9b91e2e806aa9de6f789dbee8afe2588bdacc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Tue, 16 Apr 2019 20:39:14 +0800
Subject: [PATCH] Rename chapters 11 to 15 and split chapter 15 into 15 and 16

---
 ... => 11_training_deep_neural_networks.ipynb |    2 +-
 ..._models_and_training_with_tensorflow.ipynb |    4 +-
 13_loading_and_preprocessing_data.ipynb       |    2 +-
 ...=> 14_deep_computer_vision_with_cnns.ipynb |    2 +-
 ...essing_sequences_using_rnns_and_cnns.ipynb | 1102 +--------------
 16_nlp_with_rnns_and_attention.ipynb          | 1228 +++++++++++++++++
 6 files changed, 1240 insertions(+), 1100 deletions(-)
 rename 11_deep_learning.ipynb => 11_training_deep_neural_networks.ipynb (99%)
 rename 12_custom_models_with_tensorflow_2.ipynb => 12_custom_models_and_training_with_tensorflow.ipynb (99%)
 rename 14_convolutional_neural_networks.ipynb => 14_deep_computer_vision_with_cnns.ipynb (99%)
 rename 15_recurrent_neural_networks.ipynb => 15_processing_sequences_using_rnns_and_cnns.ipynb (52%)
 create mode 100644 16_nlp_with_rnns_and_attention.ipynb

diff --git a/11_deep_learning.ipynb b/11_training_deep_neural_networks.ipynb
similarity index 99%
rename from 11_deep_learning.ipynb
rename to 11_training_deep_neural_networks.ipynb
index b0c4eec..0e8ed3b 100644
--- a/11_deep_learning.ipynb
+++ b/11_training_deep_neural_networks.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Chapter 11 – Deep Learning**"
+    "**Chapter 11 – Training Deep Neural Networks**"
    ]
   },
   {
diff --git a/12_custom_models_with_tensorflow_2.ipynb b/12_custom_models_and_training_with_tensorflow.ipynb
similarity index 99%
rename from 12_custom_models_with_tensorflow_2.ipynb
rename to 12_custom_models_and_training_with_tensorflow.ipynb
index 895719d..887c5c7 100644
--- a/12_custom_models_with_tensorflow_2.ipynb
+++ b/12_custom_models_and_training_with_tensorflow.ipynb
@@ -4,14 +4,14 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Custom Models and Training with TensorFlow 2"
+    "**Chapter 12 – Custom Models and Training with TensorFlow**"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "_This notebook contains all the sample code and solutions to the exercises in chapter 12._"
+    "_This notebook contains all the sample code in chapter 12._"
    ]
   },
   {
diff --git a/13_loading_and_preprocessing_data.ipynb b/13_loading_and_preprocessing_data.ipynb
index 901c074..997e458 100644
--- a/13_loading_and_preprocessing_data.ipynb
+++ b/13_loading_and_preprocessing_data.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Chapter 13 – Loading and Preprocessing Data**\n",
+    "**Chapter 13 – Loading and Preprocessing Data with TensorFlow**\n",
     "\n",
     "_This notebook contains all the sample code and solutions to the exercises in chapter 13._"
    ]
diff --git a/14_convolutional_neural_networks.ipynb b/14_deep_computer_vision_with_cnns.ipynb
similarity index 99%
rename from 14_convolutional_neural_networks.ipynb
rename to 14_deep_computer_vision_with_cnns.ipynb
index bbbfb33..6a7ee6a 100644
--- a/14_convolutional_neural_networks.ipynb
+++ b/14_deep_computer_vision_with_cnns.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Chapter 14 – Convolutional Neural Networks**"
+    "**Chapter 14 – Deep Computer Vision Using Convolutional Neural Networks**"
    ]
   },
   {
diff --git a/15_recurrent_neural_networks.ipynb b/15_processing_sequences_using_rnns_and_cnns.ipynb
similarity index 52%
rename from 15_recurrent_neural_networks.ipynb
rename to 15_processing_sequences_using_rnns_and_cnns.ipynb
index e884fc3..07c979e 100644
--- a/15_recurrent_neural_networks.ipynb
+++ b/15_processing_sequences_using_rnns_and_cnns.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "**Chapter 15 – Recurrent Neural Networks**"
+    "**Chapter 15 – Processing Sequences Using RNNs and CNNs**"
    ]
   },
   {
@@ -1107,1095 +1107,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Splitting a sequence into batches of shuffled windows"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "For example, let's split the sequence 0 to 14 into windows of length 5, each shifted by 2 (e.g.,`[0, 1, 2, 3, 4]`, `[2, 3, 4, 5, 6]`, etc.), then shuffle them, and split them into inputs (the first 4 steps) and targets (the last 4 steps) (e.g., `[2, 3, 4, 5, 6]` would be split into `[[2, 3, 4, 5], [3, 4, 5, 6]]`), then create batches of 3 such input/target pairs:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 58,
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "np.random.seed(42)\n",
-    "tf.random.set_seed(42)\n",
-    "\n",
-    "n_steps = 5\n",
-    "dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))\n",
-    "dataset = dataset.window(n_steps, shift=2, drop_remainder=True)\n",
-    "dataset = dataset.flat_map(lambda window: window.batch(n_steps))\n",
-    "dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))\n",
-    "dataset = dataset.batch(3).prefetch(1)\n",
-    "for index, (X_batch, Y_batch) in enumerate(dataset):\n",
-    "    print(\"_\" * 20, \"Batch\", index, \"\\nX_batch\")\n",
-    "    print(X_batch.numpy())\n",
-    "    print(\"=\" * 5, \"\\nY_batch\")\n",
-    "    print(Y_batch.numpy())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Char-RNN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 59,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "shakespeare_url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n",
-    "filepath = keras.utils.get_file(\"shakespeare.txt\", shakespeare_url)\n",
-    "with open(filepath) as f:\n",
-    "    shakespeare_text = f.read()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 60,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(shakespeare_text[:148])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 61,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\"\".join(sorted(set(shakespeare_text.lower())))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 62,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)\n",
-    "tokenizer.fit_on_texts(shakespeare_text)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 63,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tokenizer.texts_to_sequences([\"First\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 64,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 65,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "max_id = len(tokenizer.word_index) # number of distinct characters\n",
-    "dataset_size = tokenizer.document_count # total number of characters"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 66,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1\n",
-    "train_size = dataset_size * 90 // 100\n",
-    "dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 67,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "n_steps = 100\n",
-    "window_length = n_steps + 1 # target = input shifted 1 character ahead\n",
-    "dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 68,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = dataset.flat_map(lambda window: window.batch(window_length))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 69,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "np.random.seed(42)\n",
-    "tf.random.set_seed(42)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 70,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "batch_size = 32\n",
-    "dataset = dataset.shuffle(10000).batch(batch_size)\n",
-    "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 71,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = dataset.map(\n",
-    "    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 72,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = dataset.prefetch(1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 73,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for X_batch, Y_batch in dataset.take(1):\n",
-    "    print(X_batch.shape, Y_batch.shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 74,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = keras.models.Sequential([\n",
-    "    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],\n",
-    "                     dropout=0.2, recurrent_dropout=0.2),\n",
-    "    keras.layers.GRU(128, return_sequences=True,\n",
-    "                     dropout=0.2, recurrent_dropout=0.2),\n",
-    "    keras.layers.TimeDistributed(keras.layers.Dense(max_id,\n",
-    "                                                    activation=\"softmax\"))\n",
-    "])\n",
-    "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n",
-    "history = model.fit(dataset, steps_per_epoch=train_size // batch_size,\n",
-    "                    epochs=10)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 75,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def preprocess(texts):\n",
-    "    X = np.array(tokenizer.texts_to_sequences(texts)) - 1\n",
-    "    return tf.one_hot(X, max_id)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 76,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X_new = preprocess([\"How are yo\"])\n",
-    "Y_pred = model.predict_classes(X_new)\n",
-    "tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)\n",
-    "\n",
-    "tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 78,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def next_char(text, temperature=1):\n",
-    "    X_new = preprocess([text])\n",
-    "    y_proba = model.predict(X_new)[0, -1:, :]\n",
-    "    rescaled_logits = tf.math.log(y_proba) / temperature\n",
-    "    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1\n",
-    "    return tokenizer.sequences_to_texts(char_id.numpy())[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)\n",
-    "\n",
-    "next_char(\"How are yo\", temperature=1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def complete_text(text, n_chars=50, temperature=1):\n",
-    "    for _ in range(n_chars):\n",
-    "        text += next_char(text, temperature)\n",
-    "    return text"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 81,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)\n",
-    "\n",
-    "print(complete_text(\"t\", temperature=0.2))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 82,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(complete_text(\"t\", temperature=1))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 83,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "print(complete_text(\"t\", temperature=2))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Stateful RNN"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 84,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 85,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])\n",
-    "dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n",
-    "dataset = dataset.flat_map(lambda window: window.batch(window_length))\n",
-    "dataset = dataset.repeat().batch(1)\n",
-    "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n",
-    "dataset = dataset.map(\n",
-    "    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n",
-    "dataset = dataset.prefetch(1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 86,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "batch_size = 32\n",
-    "encoded_parts = np.array_split(encoded[:train_size], batch_size)\n",
-    "datasets = []\n",
-    "for encoded_part in encoded_parts:\n",
-    "    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)\n",
-    "    dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n",
-    "    dataset = dataset.flat_map(lambda window: window.batch(window_length))\n",
-    "    datasets.append(dataset)\n",
-    "dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))\n",
-    "dataset = dataset.repeat().map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n",
-    "dataset = dataset.map(\n",
-    "    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n",
-    "dataset = dataset.prefetch(1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 87,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = keras.models.Sequential([\n",
-    "    keras.layers.GRU(128, return_sequences=True, stateful=True,\n",
-    "#                     dropout=0.2, recurrent_dropout=0.2, # see TF issue #27829\n",
-    "                     batch_input_shape=[batch_size, None, max_id]),\n",
-    "    keras.layers.GRU(128, return_sequences=True, stateful=True\n",
-    "#                     dropout=0.2, recurrent_dropout=0.2 # see TF issue #27829\n",
-    "                    ),\n",
-    "    keras.layers.TimeDistributed(keras.layers.Dense(max_id,\n",
-    "                                                    activation=\"softmax\"))\n",
-    "])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 88,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class ResetStatesCallback(keras.callbacks.Callback):\n",
-    "    def on_epoch_begin(self, epoch, logs):\n",
-    "        self.model.reset_states()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 89,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n",
-    "steps_per_epoch = train_size // batch_size // n_steps\n",
-    "model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=50,\n",
-    "                   callbacks=[ResetStatesCallback()])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To use the model with different batch sizes, we need to create a stateless copy. We can get rid of dropout since it is only used during training:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 90,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "stateless_model = keras.models.Sequential([\n",
-    "    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),\n",
-    "    keras.layers.GRU(128, return_sequences=True),\n",
-    "    keras.layers.TimeDistributed(keras.layers.Dense(max_id,\n",
-    "                                                    activation=\"softmax\"))\n",
-    "])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To set the weights, we first need to build the model (so the weights get created):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 91,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "stateless_model.build(tf.TensorShape([None, None, max_id]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 92,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "stateless_model.set_weights(model.get_weights())\n",
-    "model = stateless_model"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 93,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)\n",
-    "\n",
-    "print(complete_text(\"t\"))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Sentiment Analysis"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 94,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "You can load the IMDB dataset easily:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 95,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "(X_train, y_test), (X_valid, y_test) = keras.datasets.imdb.load_data()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 96,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X_train[0][:10]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 97,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "word_index = keras.datasets.imdb.get_word_index()\n",
-    "id_to_word = {id_ + 3: word for word, id_ in word_index.items()}\n",
-    "for id_, token in enumerate((\"<pad>\", \"<sos>\", \"<unk>\")):\n",
-    "    id_to_word[id_] = token\n",
-    "\" \".join([id_to_word[id_] for id_ in X_train[0][:10]])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 98,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tensorflow_datasets as tfds\n",
-    "\n",
-    "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 99,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "datasets.keys()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 100,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_size = info.splits[\"train\"].num_examples\n",
-    "test_size = info.splits[\"test\"].num_examples"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 101,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "train_size, test_size"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 102,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for X_batch, y_batch in datasets[\"train\"].batch(2).take(1):\n",
-    "    for review, label in zip(X_batch.numpy(), y_batch.numpy()):\n",
-    "        print(\"Review:\", review.decode(\"utf-8\")[:200], \"...\")\n",
-    "        print(\"Label:\", label, \"= Positive\" if label else \"= Negative\")\n",
-    "        print()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 103,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def preprocess(X_batch, y_batch):\n",
-    "    X_batch = tf.strings.substr(X_batch, 0, 300)\n",
-    "    X_batch = tf.strings.regex_replace(X_batch, rb\"<br\\s*/?>\", b\" \")\n",
-    "    X_batch = tf.strings.regex_replace(X_batch, b\"[^a-zA-Z']\", b\" \")\n",
-    "    X_batch = tf.strings.split(X_batch)\n",
-    "    return X_batch.to_tensor(default_value=b\"<pad>\"), y_batch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 104,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "preprocess(X_batch, y_batch)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 105,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from collections import Counter\n",
-    "\n",
-    "vocabulary = Counter()\n",
-    "for X_batch, y_batch in datasets[\"train\"].batch(32).map(preprocess):\n",
-    "    for review in X_batch:\n",
-    "        vocabulary.update(list(review.numpy()))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 106,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "vocabulary.most_common()[:3]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 107,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(vocabulary)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 108,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "vocab_size = 10000\n",
-    "truncated_vocabulary = [\n",
-    "    word for word, count in vocabulary.most_common()[:vocab_size]]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 109,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}\n",
-    "for word in b\"This movie was faaaaaantastic\".split():\n",
-    "    print(word_to_id.get(word) or vocab_size)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 110,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "words = tf.constant(truncated_vocabulary)\n",
-    "word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)\n",
-    "vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)\n",
-    "num_oov_buckets = 1000\n",
-    "table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 111,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "table.lookup(tf.constant([b\"This movie was faaaaaantastic\".split()]))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 112,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def encode_words(X_batch, y_batch):\n",
-    "    return table.lookup(X_batch), y_batch\n",
-    "\n",
-    "train_set = datasets[\"train\"].repeat().batch(32).map(preprocess)\n",
-    "train_set = train_set.map(encode_words).prefetch(1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 113,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for X_batch, y_batch in train_set.take(1):\n",
-    "    print(X_batch)\n",
-    "    print(y_batch)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 114,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "embed_size = 128\n",
-    "model = keras.models.Sequential([\n",
-    "    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,\n",
-    "                           mask_zero=True, # not shown in the book\n",
-    "                           input_shape=[None]),\n",
-    "    keras.layers.GRU(128, return_sequences=True),\n",
-    "    keras.layers.GRU(128),\n",
-    "    keras.layers.Dense(1, activation=\"sigmoid\")\n",
-    "])\n",
-    "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n",
-    "history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Or using manual masking:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 115,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "K = keras.backend\n",
-    "embed_size = 128\n",
-    "inputs = keras.layers.Input(shape=[None])\n",
-    "mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)\n",
-    "z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)\n",
-    "z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)\n",
-    "z = keras.layers.GRU(128)(z, mask=mask)\n",
-    "outputs = keras.layers.Dense(1, activation=\"sigmoid\")(z)\n",
-    "model = keras.models.Model(inputs=[inputs], outputs=[outputs])\n",
-    "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n",
-    "history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Reusing Pretrained Embeddings"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 116,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 117,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "TFHUB_CACHE_DIR = os.path.join(os.curdir, \"my_tfhub_cache\")\n",
-    "os.environ[\"TFHUB_CACHE_DIR\"] = TFHUB_CACHE_DIR"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 118,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tensorflow_hub as hub\n",
-    "\n",
-    "model = keras.Sequential([\n",
-    "    hub.KerasLayer(\"https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1\",\n",
-    "                   dtype=tf.string, input_shape=[], output_shape=[50]),\n",
-    "    keras.layers.Dense(128, activation=\"relu\"),\n",
-    "    keras.layers.Dense(1, activation=\"sigmoid\")\n",
-    "])\n",
-    "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\",\n",
-    "              metrics=[\"accuracy\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 119,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for dirpath, dirnames, filenames in os.walk(TFHUB_CACHE_DIR):\n",
-    "    for filename in filenames:\n",
-    "        print(os.path.join(dirpath, filename))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 120,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tensorflow_datasets as tfds\n",
-    "\n",
-    "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)\n",
-    "train_size = info.splits[\"train\"].num_examples\n",
-    "batch_size = 32\n",
-    "train_set = datasets[\"train\"].repeat().batch(batch_size).prefetch(1)\n",
-    "history = model.fit(train_set, steps_per_epoch=train_size // batch_size, epochs=5)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Automatic Translation"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 121,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tf.random.set_seed(42)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 122,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "vocab_size = 100\n",
-    "embed_size = 10"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 123,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tensorflow_addons as tfa\n",
-    "\n",
-    "encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
-    "decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
-    "sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)\n",
-    "\n",
-    "embeddings = keras.layers.Embedding(vocab_size, embed_size)\n",
-    "encoder_embeddings = embeddings(encoder_inputs)\n",
-    "decoder_embeddings = embeddings(decoder_inputs)\n",
-    "\n",
-    "encoder = keras.layers.LSTM(512, return_state=True)\n",
-    "encoder_outputs, state_h, state_c = encoder(encoder_embeddings)\n",
-    "encoder_state = [state_h, state_c]\n",
-    "\n",
-    "sampler = tfa.seq2seq.sampler.TrainingSampler()\n",
-    "\n",
-    "decoder_cell = keras.layers.LSTMCell(512)\n",
-    "output_layer = keras.layers.Dense(vocab_size)\n",
-    "decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,\n",
-    "                                                 output_layer=output_layer)\n",
-    "final_outputs, final_state, final_sequence_lengths = decoder(\n",
-    "    decoder_embeddings, initial_state=encoder_state,\n",
-    "    sequence_length=sequence_lengths)\n",
-    "Y_proba = tf.nn.softmax(final_outputs.rnn_output)\n",
-    "\n",
-    "model = keras.models.Model(\n",
-    "    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],\n",
-    "    outputs=[Y_proba])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 124,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 125,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X = np.random.randint(100, size=10*1000).reshape(1000, 10)\n",
-    "Y = np.random.randint(100, size=15*1000).reshape(1000, 15)\n",
-    "X_decoder = np.c_[np.zeros((1000, 1)), Y[:, :-1]]\n",
-    "seq_lengths = np.full([1000], 15)\n",
-    "\n",
-    "history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Bidirectional Recurrent Layers"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 126,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = keras.models.Sequential([\n",
-    "    keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]),\n",
-    "    keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True))\n",
-    "])\n",
-    "\n",
-    "model.summary()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Positional Encoding"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 127,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "class PositionalEncoding(keras.layers.Layer):\n",
-    "    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):\n",
-    "        super().__init__(dtype=dtype, **kwargs)\n",
-    "        if max_dims % 2 == 1: max_dims += 1 # max_dims must be even\n",
-    "        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))\n",
-    "        pos_emb = np.empty((1, max_steps, max_dims))\n",
-    "        pos_emb[0, :, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T\n",
-    "        pos_emb[0, :, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T\n",
-    "        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))\n",
-    "    def call(self, inputs):\n",
-    "        shape = tf.shape(inputs)\n",
-    "        return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 128,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "max_steps = 201\n",
-    "max_dims = 512\n",
-    "pos_emb = PositionalEncoding(max_steps, max_dims)\n",
-    "PE = pos_emb(np.zeros((1, max_steps, max_dims), np.float32))[0].numpy()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 129,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "i1, i2, crop_i = 100, 101, 150\n",
-    "p1, p2, p3 = 22, 60, 35\n",
-    "fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(9, 5))\n",
-    "ax1.plot([p1, p1], [-1, 1], \"k--\", label=\"$p = {}$\".format(p1))\n",
-    "ax1.plot([p2, p2], [-1, 1], \"k--\", label=\"$p = {}$\".format(p2), alpha=0.5)\n",
-    "ax1.plot(p3, PE[p3, i1], \"bx\", label=\"$p = {}$\".format(p3))\n",
-    "ax1.plot(PE[:,i1], \"b-\", label=\"$i = {}$\".format(i1))\n",
-    "ax1.plot(PE[:,i2], \"r-\", label=\"$i = {}$\".format(i2))\n",
-    "ax1.plot([p1, p2], [PE[p1, i1], PE[p2, i1]], \"bo\")\n",
-    "ax1.plot([p1, p2], [PE[p1, i2], PE[p2, i2]], \"ro\")\n",
-    "ax1.legend(loc=\"center right\", fontsize=14, framealpha=0.95)\n",
-    "ax1.set_ylabel(\"$P_{(p,i)}$\", rotation=0, fontsize=16)\n",
-    "ax1.grid(True, alpha=0.3)\n",
-    "ax1.hlines(0, 0, max_steps - 1, color=\"k\", linewidth=1, alpha=0.3)\n",
-    "ax1.axis([0, max_steps - 1, -1, 1])\n",
-    "ax2.imshow(PE.T[:crop_i], cmap=\"gray\", interpolation=\"bilinear\", aspect=\"auto\")\n",
-    "ax2.hlines(i1, 0, max_steps - 1, color=\"b\")\n",
-    "cheat = 2 # need to raise the red line a bit, or else it hides the blue one\n",
-    "ax2.hlines(i2+cheat, 0, max_steps - 1, color=\"r\")\n",
-    "ax2.plot([p1, p1], [0, crop_i], \"k--\")\n",
-    "ax2.plot([p2, p2], [0, crop_i], \"k--\", alpha=0.5)\n",
-    "ax2.plot([p1, p2], [i2+cheat, i2+cheat], \"ro\")\n",
-    "ax2.plot([p1, p2], [i1, i1], \"bo\")\n",
-    "ax2.axis([0, max_steps - 1, 0, crop_i])\n",
-    "ax2.set_xlabel(\"$p$\", fontsize=16)\n",
-    "ax2.set_ylabel(\"$i$\", rotation=0, fontsize=16)\n",
-    "plt.savefig(\"positional_embedding_plot\")\n",
-    "plt.show()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 130,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "embed_size = 512; max_steps = 500; vocab_size = 10000\n",
-    "encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
-    "decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
-    "embeddings = keras.layers.Embedding(vocab_size, embed_size)\n",
-    "encoder_embeddings = embeddings(encoder_inputs)\n",
-    "decoder_embeddings = embeddings(decoder_inputs)\n",
-    "positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)\n",
-    "encoder_in = positional_encoding(encoder_embeddings)\n",
-    "decoder_in = positional_encoding(decoder_embeddings)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Here is a (very) simplified Transformer (the actual architecture has skip connections, layer norm, dense nets, and most importantly it uses Multi-Head Attention instead of regular Attention):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 131,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for N in range(6):\n",
-    "    encoder_attn = keras.layers.Attention(use_scale=True)\n",
-    "    encoder_in = encoder_attn([encoder_in, encoder_in])\n",
-    "    masked_decoder_attn = keras.layers.Attention(use_scale=True, causal=True)\n",
-    "    decoder_in = masked_decoder_attn([decoder_in, decoder_in])\n",
-    "    decoder_attn = keras.layers.Attention(use_scale=True)\n",
-    "    final_enc = decoder_attn([decoder_in, encoder_in])\n",
-    "\n",
-    "output_layer = keras.layers.TimeDistributed(\n",
-    "    keras.layers.Dense(vocab_size, activation=\"softmax\"))\n",
-    "outputs = output_layer(final_enc)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Here's a basic implementation of the `MultiHeadAttention` layer. One will likely be added to `keras.layers` in the near future. Note that `Conv1D` layers with `kernel_size=1` (and the default `padding=\"valid\"` and `strides=1`) is equivalent to a `TimeDistributed(Dense(...))` layer."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 132,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "K = keras.backend\n",
-    "\n",
-    "class MultiHeadAttention(keras.layers.Layer):\n",
-    "    def __init__(self, n_heads, causal=False, use_scale=False, **kwargs):\n",
-    "        self.n_heads = n_heads\n",
-    "        self.causal = causal\n",
-    "        self.use_scale = use_scale\n",
-    "        super().__init__(**kwargs)\n",
-    "    def build(self, batch_input_shape):\n",
-    "        self.dims = batch_input_shape[0][-1]\n",
-    "        self.q_dims, self.v_dims, self.k_dims = [self.dims // self.n_heads] * 3 # could be hyperparameters instead\n",
-    "        self.q_linear = keras.layers.Conv1D(self.n_heads * self.q_dims, kernel_size=1, use_bias=False)\n",
-    "        self.v_linear = keras.layers.Conv1D(self.n_heads * self.v_dims, kernel_size=1, use_bias=False)\n",
-    "        self.k_linear = keras.layers.Conv1D(self.n_heads * self.k_dims, kernel_size=1, use_bias=False)\n",
-    "        self.attention = keras.layers.Attention(causal=self.causal, use_scale=self.use_scale)\n",
-    "        self.out_linear = keras.layers.Conv1D(self.dims, kernel_size=1, use_bias=False)\n",
-    "        super().build(batch_input_shape)\n",
-    "    def _multi_head_linear(self, inputs, linear):\n",
-    "        shape = K.concatenate([K.shape(inputs)[:-1], [self.n_heads, -1]])\n",
-    "        projected = K.reshape(linear(inputs), shape)\n",
-    "        perm = K.permute_dimensions(projected, [0, 2, 1, 3])\n",
-    "        return K.reshape(perm, [shape[0] * self.n_heads, shape[1], -1])\n",
-    "    def call(self, inputs):\n",
-    "        q = inputs[0]\n",
-    "        v = inputs[1]\n",
-    "        k = inputs[2] if len(inputs) > 2 else v\n",
-    "        shape = K.shape(q)\n",
-    "        q_proj = self._multi_head_linear(q, self.q_linear)\n",
-    "        v_proj = self._multi_head_linear(v, self.v_linear)\n",
-    "        k_proj = self._multi_head_linear(k, self.k_linear)\n",
-    "        multi_attended = self.attention([q_proj, v_proj, k_proj])\n",
-    "        shape_attended = K.shape(multi_attended)\n",
-    "        reshaped_attended = K.reshape(multi_attended, [shape[0], self.n_heads, shape_attended[1], shape_attended[2]])\n",
-    "        perm = K.permute_dimensions(reshaped_attended, [0, 2, 1, 3])\n",
-    "        concat = K.reshape(perm, [shape[0], shape_attended[1], -1])\n",
-    "        return self.out_linear(concat)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 133,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "Q = np.random.rand(2, 50, 512)\n",
-    "V = np.random.rand(2, 80, 512)\n",
-    "multi_attn = MultiHeadAttention(8)\n",
-    "multi_attn([Q, V]).shape"
+    "In this chapter we explored the fundamentals of RNNs and used them to process sequences (namely, time series). In the process we also looked at other ways to process sequences, including CNNs. In the next chapter we will use RNNs for Natural Language Processing, and we will learn more about RNNs (bidirectional RNNs, stateful vs stateless RNNs, Encoder–Decoders, and Attention-augmented Encoder-Decoders). We will also look at the Transformer, an Attention-only architecture."
    ]
   },
   {
@@ -2237,7 +1149,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
+   "execution_count": 58,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2282,7 +1194,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
+   "execution_count": 59,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2299,7 +1211,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
+   "execution_count": 60,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2316,7 +1228,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 137,
+   "execution_count": 61,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2337,7 +1249,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
+   "execution_count": 62,
    "metadata": {},
    "outputs": [],
    "source": [
diff --git a/16_nlp_with_rnns_and_attention.ipynb b/16_nlp_with_rnns_and_attention.ipynb
new file mode 100644
index 0000000..07d63e0
--- /dev/null
+++ b/16_nlp_with_rnns_and_attention.ipynb
@@ -0,0 +1,1228 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Chapter 16 – Natural Language Processing with RNNs and Attention**"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "_This notebook contains all the sample code in chapter 16._"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0-preview."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Python ≥3.5 is required\n",
+    "import sys\n",
+    "assert sys.version_info >= (3, 5)\n",
+    "\n",
+    "# Scikit-Learn ≥0.20 is required\n",
+    "import sklearn\n",
+    "assert sklearn.__version__ >= \"0.20\"\n",
+    "\n",
+    "# TensorFlow ≥2.0-preview is required\n",
+    "import tensorflow as tf\n",
+    "from tensorflow import keras\n",
+    "assert tf.__version__ >= \"2.0\"\n",
+    "\n",
+    "# Common imports\n",
+    "import numpy as np\n",
+    "import os\n",
+    "\n",
+    "# to make this notebook's output stable across runs\n",
+    "np.random.seed(42)\n",
+    "tf.random.set_seed(42)\n",
+    "\n",
+    "# To plot pretty figures\n",
+    "%matplotlib inline\n",
+    "import matplotlib as mpl\n",
+    "import matplotlib.pyplot as plt\n",
+    "mpl.rc('axes', labelsize=14)\n",
+    "mpl.rc('xtick', labelsize=12)\n",
+    "mpl.rc('ytick', labelsize=12)\n",
+    "\n",
+    "# Where to save the figures\n",
+    "PROJECT_ROOT_DIR = \".\"\n",
+    "CHAPTER_ID = \"nlp\"\n",
+    "IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
+    "os.makedirs(IMAGES_PATH, exist_ok=True)\n",
+    "\n",
+    "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
+    "    path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n",
+    "    print(\"Saving figure\", fig_id)\n",
+    "    if tight_layout:\n",
+    "        plt.tight_layout()\n",
+    "    plt.savefig(path, format=fig_extension, dpi=resolution)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Char-RNN"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Splitting a sequence into batches of shuffled windows"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For example, let's split the sequence 0 to 14 into windows of length 5, each shifted by 2 (e.g.,`[0, 1, 2, 3, 4]`, `[2, 3, 4, 5, 6]`, etc.), then shuffle them, and split them into inputs (the first 4 steps) and targets (the last 4 steps) (e.g., `[2, 3, 4, 5, 6]` would be split into `[[2, 3, 4, 5], [3, 4, 5, 6]]`), then create batches of 3 such input/target pairs:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "np.random.seed(42)\n",
+    "tf.random.set_seed(42)\n",
+    "\n",
+    "n_steps = 5\n",
+    "dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))\n",
+    "dataset = dataset.window(n_steps, shift=2, drop_remainder=True)\n",
+    "dataset = dataset.flat_map(lambda window: window.batch(n_steps))\n",
+    "dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))\n",
+    "dataset = dataset.batch(3).prefetch(1)\n",
+    "for index, (X_batch, Y_batch) in enumerate(dataset):\n",
+    "    print(\"_\" * 20, \"Batch\", index, \"\\nX_batch\")\n",
+    "    print(X_batch.numpy())\n",
+    "    print(\"=\" * 5, \"\\nY_batch\")\n",
+    "    print(Y_batch.numpy())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading the Data and Preparing the Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "shakespeare_url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n",
+    "filepath = keras.utils.get_file(\"shakespeare.txt\", shakespeare_url)\n",
+    "with open(filepath) as f:\n",
+    "    shakespeare_text = f.read()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(shakespeare_text[:148])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\".join(sorted(set(shakespeare_text.lower())))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)\n",
+    "tokenizer.fit_on_texts(shakespeare_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer.texts_to_sequences([\"First\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_id = len(tokenizer.word_index) # number of distinct characters\n",
+    "dataset_size = tokenizer.document_count # total number of characters"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1\n",
+    "train_size = dataset_size * 90 // 100\n",
+    "dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_steps = 100\n",
+    "window_length = n_steps + 1 # target = input shifted 1 character ahead\n",
+    "dataset = dataset.repeat().window(window_length, shift=1, drop_remainder=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = dataset.flat_map(lambda window: window.batch(window_length))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "np.random.seed(42)\n",
+    "tf.random.set_seed(42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 32\n",
+    "dataset = dataset.shuffle(10000).batch(batch_size)\n",
+    "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = dataset.map(\n",
+    "    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = dataset.prefetch(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for X_batch, Y_batch in dataset.take(1):\n",
+    "    print(X_batch.shape, Y_batch.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Creating and Training the Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = keras.models.Sequential([\n",
+    "    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],\n",
+    "                     dropout=0.2, recurrent_dropout=0.2),\n",
+    "    keras.layers.GRU(128, return_sequences=True,\n",
+    "                     dropout=0.2, recurrent_dropout=0.2),\n",
+    "    keras.layers.TimeDistributed(keras.layers.Dense(max_id,\n",
+    "                                                    activation=\"softmax\"))\n",
+    "])\n",
+    "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n",
+    "history = model.fit(dataset, steps_per_epoch=train_size // batch_size,\n",
+    "                    epochs=10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using the Model to Generate Text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocess(texts):\n",
+    "    X = np.array(tokenizer.texts_to_sequences(texts)) - 1\n",
+    "    return tf.one_hot(X, max_id)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_new = preprocess([\"How are yo\"])\n",
+    "Y_pred = model.predict_classes(X_new)\n",
+    "tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)\n",
+    "\n",
+    "tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def next_char(text, temperature=1):\n",
+    "    X_new = preprocess([text])\n",
+    "    y_proba = model.predict(X_new)[0, -1:, :]\n",
+    "    rescaled_logits = tf.math.log(y_proba) / temperature\n",
+    "    char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1\n",
+    "    return tokenizer.sequences_to_texts(char_id.numpy())[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)\n",
+    "\n",
+    "next_char(\"How are yo\", temperature=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def complete_text(text, n_chars=50, temperature=1):\n",
+    "    for _ in range(n_chars):\n",
+    "        text += next_char(text, temperature)\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)\n",
+    "\n",
+    "print(complete_text(\"t\", temperature=0.2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(complete_text(\"t\", temperature=1))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(complete_text(\"t\", temperature=2))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Stateful RNN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])\n",
+    "dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n",
+    "dataset = dataset.flat_map(lambda window: window.batch(window_length))\n",
+    "dataset = dataset.repeat().batch(1)\n",
+    "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n",
+    "dataset = dataset.map(\n",
+    "    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n",
+    "dataset = dataset.prefetch(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch_size = 32\n",
+    "encoded_parts = np.array_split(encoded[:train_size], batch_size)\n",
+    "datasets = []\n",
+    "for encoded_part in encoded_parts:\n",
+    "    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)\n",
+    "    dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n",
+    "    dataset = dataset.flat_map(lambda window: window.batch(window_length))\n",
+    "    datasets.append(dataset)\n",
+    "dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))\n",
+    "dataset = dataset.repeat().map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n",
+    "dataset = dataset.map(\n",
+    "    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n",
+    "dataset = dataset.prefetch(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = keras.models.Sequential([\n",
+    "    keras.layers.GRU(128, return_sequences=True, stateful=True,\n",
+    "#                     dropout=0.2, recurrent_dropout=0.2, # see TF issue #27829\n",
+    "                     batch_input_shape=[batch_size, None, max_id]),\n",
+    "    keras.layers.GRU(128, return_sequences=True, stateful=True\n",
+    "#                     dropout=0.2, recurrent_dropout=0.2 # see TF issue #27829\n",
+    "                    ),\n",
+    "    keras.layers.TimeDistributed(keras.layers.Dense(max_id,\n",
+    "                                                    activation=\"softmax\"))\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ResetStatesCallback(keras.callbacks.Callback):\n",
+    "    def on_epoch_begin(self, epoch, logs):\n",
+    "        self.model.reset_states()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n",
+    "steps_per_epoch = train_size // batch_size // n_steps\n",
+    "model.fit(dataset, steps_per_epoch=steps_per_epoch, epochs=50,\n",
+    "                   callbacks=[ResetStatesCallback()])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To use the model with different batch sizes, we need to create a stateless copy. We can get rid of dropout since it is only used during training:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stateless_model = keras.models.Sequential([\n",
+    "    keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),\n",
+    "    keras.layers.GRU(128, return_sequences=True),\n",
+    "    keras.layers.TimeDistributed(keras.layers.Dense(max_id,\n",
+    "                                                    activation=\"softmax\"))\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To set the weights, we first need to build the model (so the weights get created):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stateless_model.build(tf.TensorShape([None, None, max_id]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stateless_model.set_weights(model.get_weights())\n",
+    "model = stateless_model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)\n",
+    "\n",
+    "print(complete_text(\"t\"))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Sentiment Analysis"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can load the IMDB dataset easily:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(X_train, y_test), (X_valid, y_test) = keras.datasets.imdb.load_data()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train[0][:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "word_index = keras.datasets.imdb.get_word_index()\n",
+    "id_to_word = {id_ + 3: word for word, id_ in word_index.items()}\n",
+    "for id_, token in enumerate((\"<pad>\", \"<sos>\", \"<unk>\")):\n",
+    "    id_to_word[id_] = token\n",
+    "\" \".join([id_to_word[id_] for id_ in X_train[0][:10]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_datasets as tfds\n",
+    "\n",
+    "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "datasets.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_size = info.splits[\"train\"].num_examples\n",
+    "test_size = info.splits[\"test\"].num_examples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_size, test_size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for X_batch, y_batch in datasets[\"train\"].batch(2).take(1):\n",
+    "    for review, label in zip(X_batch.numpy(), y_batch.numpy()):\n",
+    "        print(\"Review:\", review.decode(\"utf-8\")[:200], \"...\")\n",
+    "        print(\"Label:\", label, \"= Positive\" if label else \"= Negative\")\n",
+    "        print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocess(X_batch, y_batch):\n",
+    "    X_batch = tf.strings.substr(X_batch, 0, 300)\n",
+    "    X_batch = tf.strings.regex_replace(X_batch, rb\"<br\\s*/?>\", b\" \")\n",
+    "    X_batch = tf.strings.regex_replace(X_batch, b\"[^a-zA-Z']\", b\" \")\n",
+    "    X_batch = tf.strings.split(X_batch)\n",
+    "    return X_batch.to_tensor(default_value=b\"<pad>\"), y_batch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "preprocess(X_batch, y_batch)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import Counter\n",
+    "\n",
+    "vocabulary = Counter()\n",
+    "for X_batch, y_batch in datasets[\"train\"].batch(32).map(preprocess):\n",
+    "    for review in X_batch:\n",
+    "        vocabulary.update(list(review.numpy()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vocabulary.most_common()[:3]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(vocabulary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vocab_size = 10000\n",
+    "truncated_vocabulary = [\n",
+    "    word for word, count in vocabulary.most_common()[:vocab_size]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}\n",
+    "for word in b\"This movie was faaaaaantastic\".split():\n",
+    "    print(word_to_id.get(word) or vocab_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "words = tf.constant(truncated_vocabulary)\n",
+    "word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)\n",
+    "vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)\n",
+    "num_oov_buckets = 1000\n",
+    "table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "table.lookup(tf.constant([b\"This movie was faaaaaantastic\".split()]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def encode_words(X_batch, y_batch):\n",
+    "    return table.lookup(X_batch), y_batch\n",
+    "\n",
+    "train_set = datasets[\"train\"].repeat().batch(32).map(preprocess)\n",
+    "train_set = train_set.map(encode_words).prefetch(1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for X_batch, y_batch in train_set.take(1):\n",
+    "    print(X_batch)\n",
+    "    print(y_batch)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embed_size = 128\n",
+    "model = keras.models.Sequential([\n",
+    "    keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,\n",
+    "                           mask_zero=True, # not shown in the book\n",
+    "                           input_shape=[None]),\n",
+    "    keras.layers.GRU(128, return_sequences=True),\n",
+    "    keras.layers.GRU(128),\n",
+    "    keras.layers.Dense(1, activation=\"sigmoid\")\n",
+    "])\n",
+    "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n",
+    "history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Or using manual masking:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "K = keras.backend\n",
+    "embed_size = 128\n",
+    "inputs = keras.layers.Input(shape=[None])\n",
+    "mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)\n",
+    "z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)\n",
+    "z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)\n",
+    "z = keras.layers.GRU(128)(z, mask=mask)\n",
+    "outputs = keras.layers.Dense(1, activation=\"sigmoid\")(z)\n",
+    "model = keras.models.Model(inputs=[inputs], outputs=[outputs])\n",
+    "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n",
+    "history = model.fit(train_set, steps_per_epoch=train_size // 32, epochs=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Reusing Pretrained Embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TFHUB_CACHE_DIR = os.path.join(os.curdir, \"my_tfhub_cache\")\n",
+    "os.environ[\"TFHUB_CACHE_DIR\"] = TFHUB_CACHE_DIR"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_hub as hub\n",
+    "\n",
+    "model = keras.Sequential([\n",
+    "    hub.KerasLayer(\"https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1\",\n",
+    "                   dtype=tf.string, input_shape=[], output_shape=[50]),\n",
+    "    keras.layers.Dense(128, activation=\"relu\"),\n",
+    "    keras.layers.Dense(1, activation=\"sigmoid\")\n",
+    "])\n",
+    "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\",\n",
+    "              metrics=[\"accuracy\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for dirpath, dirnames, filenames in os.walk(TFHUB_CACHE_DIR):\n",
+    "    for filename in filenames:\n",
+    "        print(os.path.join(dirpath, filename))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_datasets as tfds\n",
+    "\n",
+    "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)\n",
+    "train_size = info.splits[\"train\"].num_examples\n",
+    "batch_size = 32\n",
+    "train_set = datasets[\"train\"].repeat().batch(batch_size).prefetch(1)\n",
+    "history = model.fit(train_set, steps_per_epoch=train_size // batch_size, epochs=5)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Automatic Translation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.random.set_seed(42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vocab_size = 100\n",
+    "embed_size = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow_addons as tfa\n",
+    "\n",
+    "encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
+    "decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
+    "sequence_lengths = keras.layers.Input(shape=[], dtype=np.int32)\n",
+    "\n",
+    "embeddings = keras.layers.Embedding(vocab_size, embed_size)\n",
+    "encoder_embeddings = embeddings(encoder_inputs)\n",
+    "decoder_embeddings = embeddings(decoder_inputs)\n",
+    "\n",
+    "encoder = keras.layers.LSTM(512, return_state=True)\n",
+    "encoder_outputs, state_h, state_c = encoder(encoder_embeddings)\n",
+    "encoder_state = [state_h, state_c]\n",
+    "\n",
+    "sampler = tfa.seq2seq.sampler.TrainingSampler()\n",
+    "\n",
+    "decoder_cell = keras.layers.LSTMCell(512)\n",
+    "output_layer = keras.layers.Dense(vocab_size)\n",
+    "decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,\n",
+    "                                                 output_layer=output_layer)\n",
+    "final_outputs, final_state, final_sequence_lengths = decoder(\n",
+    "    decoder_embeddings, initial_state=encoder_state,\n",
+    "    sequence_length=sequence_lengths)\n",
+    "Y_proba = tf.nn.softmax(final_outputs.rnn_output)\n",
+    "\n",
+    "model = keras.models.Model(\n",
+    "    inputs=[encoder_inputs, decoder_inputs, sequence_lengths],\n",
+    "    outputs=[Y_proba])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = np.random.randint(100, size=10*1000).reshape(1000, 10)\n",
+    "Y = np.random.randint(100, size=15*1000).reshape(1000, 15)\n",
+    "X_decoder = np.c_[np.zeros((1000, 1)), Y[:, :-1]]\n",
+    "seq_lengths = np.full([1000], 15)\n",
+    "\n",
+    "history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Bidirectional Recurrent Layers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = keras.models.Sequential([\n",
+    "    keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]),\n",
+    "    keras.layers.Bidirectional(keras.layers.GRU(10, return_sequences=True))\n",
+    "])\n",
+    "\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Positional Encoding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 71,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class PositionalEncoding(keras.layers.Layer):\n",
+    "    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):\n",
+    "        super().__init__(dtype=dtype, **kwargs)\n",
+    "        if max_dims % 2 == 1: max_dims += 1 # max_dims must be even\n",
+    "        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))\n",
+    "        pos_emb = np.empty((1, max_steps, max_dims))\n",
+    "        pos_emb[0, :, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T\n",
+    "        pos_emb[0, :, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T\n",
+    "        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))\n",
+    "    def call(self, inputs):\n",
+    "        shape = tf.shape(inputs)\n",
+    "        return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 72,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "max_steps = 201\n",
+    "max_dims = 512\n",
+    "pos_emb = PositionalEncoding(max_steps, max_dims)\n",
+    "PE = pos_emb(np.zeros((1, max_steps, max_dims), np.float32))[0].numpy()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 73,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "i1, i2, crop_i = 100, 101, 150\n",
+    "p1, p2, p3 = 22, 60, 35\n",
+    "fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(9, 5))\n",
+    "ax1.plot([p1, p1], [-1, 1], \"k--\", label=\"$p = {}$\".format(p1))\n",
+    "ax1.plot([p2, p2], [-1, 1], \"k--\", label=\"$p = {}$\".format(p2), alpha=0.5)\n",
+    "ax1.plot(p3, PE[p3, i1], \"bx\", label=\"$p = {}$\".format(p3))\n",
+    "ax1.plot(PE[:,i1], \"b-\", label=\"$i = {}$\".format(i1))\n",
+    "ax1.plot(PE[:,i2], \"r-\", label=\"$i = {}$\".format(i2))\n",
+    "ax1.plot([p1, p2], [PE[p1, i1], PE[p2, i1]], \"bo\")\n",
+    "ax1.plot([p1, p2], [PE[p1, i2], PE[p2, i2]], \"ro\")\n",
+    "ax1.legend(loc=\"center right\", fontsize=14, framealpha=0.95)\n",
+    "ax1.set_ylabel(\"$P_{(p,i)}$\", rotation=0, fontsize=16)\n",
+    "ax1.grid(True, alpha=0.3)\n",
+    "ax1.hlines(0, 0, max_steps - 1, color=\"k\", linewidth=1, alpha=0.3)\n",
+    "ax1.axis([0, max_steps - 1, -1, 1])\n",
+    "ax2.imshow(PE.T[:crop_i], cmap=\"gray\", interpolation=\"bilinear\", aspect=\"auto\")\n",
+    "ax2.hlines(i1, 0, max_steps - 1, color=\"b\")\n",
+    "cheat = 2 # need to raise the red line a bit, or else it hides the blue one\n",
+    "ax2.hlines(i2+cheat, 0, max_steps - 1, color=\"r\")\n",
+    "ax2.plot([p1, p1], [0, crop_i], \"k--\")\n",
+    "ax2.plot([p2, p2], [0, crop_i], \"k--\", alpha=0.5)\n",
+    "ax2.plot([p1, p2], [i2+cheat, i2+cheat], \"ro\")\n",
+    "ax2.plot([p1, p2], [i1, i1], \"bo\")\n",
+    "ax2.axis([0, max_steps - 1, 0, crop_i])\n",
+    "ax2.set_xlabel(\"$p$\", fontsize=16)\n",
+    "ax2.set_ylabel(\"$i$\", rotation=0, fontsize=16)\n",
+    "plt.savefig(\"positional_embedding_plot\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 74,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "embed_size = 512; max_steps = 500; vocab_size = 10000\n",
+    "encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
+    "decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)\n",
+    "embeddings = keras.layers.Embedding(vocab_size, embed_size)\n",
+    "encoder_embeddings = embeddings(encoder_inputs)\n",
+    "decoder_embeddings = embeddings(decoder_inputs)\n",
+    "positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)\n",
+    "encoder_in = positional_encoding(encoder_embeddings)\n",
+    "decoder_in = positional_encoding(decoder_embeddings)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here is a (very) simplified Transformer (the actual architecture has skip connections, layer norm, dense nets, and most importantly it uses Multi-Head Attention instead of regular Attention):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 75,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for N in range(6):\n",
+    "    encoder_attn = keras.layers.Attention(use_scale=True)\n",
+    "    encoder_in = encoder_attn([encoder_in, encoder_in])\n",
+    "    masked_decoder_attn = keras.layers.Attention(use_scale=True, causal=True)\n",
+    "    decoder_in = masked_decoder_attn([decoder_in, decoder_in])\n",
+    "    decoder_attn = keras.layers.Attention(use_scale=True)\n",
+    "    final_enc = decoder_attn([decoder_in, encoder_in])\n",
+    "\n",
+    "output_layer = keras.layers.TimeDistributed(\n",
+    "    keras.layers.Dense(vocab_size, activation=\"softmax\"))\n",
+    "outputs = output_layer(final_enc)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here's a basic implementation of the `MultiHeadAttention` layer. One will likely be added to `keras.layers` in the near future. Note that `Conv1D` layers with `kernel_size=1` (and the default `padding=\"valid\"` and `strides=1`) is equivalent to a `TimeDistributed(Dense(...))` layer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "K = keras.backend\n",
+    "\n",
+    "class MultiHeadAttention(keras.layers.Layer):\n",
+    "    def __init__(self, n_heads, causal=False, use_scale=False, **kwargs):\n",
+    "        self.n_heads = n_heads\n",
+    "        self.causal = causal\n",
+    "        self.use_scale = use_scale\n",
+    "        super().__init__(**kwargs)\n",
+    "    def build(self, batch_input_shape):\n",
+    "        self.dims = batch_input_shape[0][-1]\n",
+    "        self.q_dims, self.v_dims, self.k_dims = [self.dims // self.n_heads] * 3 # could be hyperparameters instead\n",
+    "        self.q_linear = keras.layers.Conv1D(self.n_heads * self.q_dims, kernel_size=1, use_bias=False)\n",
+    "        self.v_linear = keras.layers.Conv1D(self.n_heads * self.v_dims, kernel_size=1, use_bias=False)\n",
+    "        self.k_linear = keras.layers.Conv1D(self.n_heads * self.k_dims, kernel_size=1, use_bias=False)\n",
+    "        self.attention = keras.layers.Attention(causal=self.causal, use_scale=self.use_scale)\n",
+    "        self.out_linear = keras.layers.Conv1D(self.dims, kernel_size=1, use_bias=False)\n",
+    "        super().build(batch_input_shape)\n",
+    "    def _multi_head_linear(self, inputs, linear):\n",
+    "        shape = K.concatenate([K.shape(inputs)[:-1], [self.n_heads, -1]])\n",
+    "        projected = K.reshape(linear(inputs), shape)\n",
+    "        perm = K.permute_dimensions(projected, [0, 2, 1, 3])\n",
+    "        return K.reshape(perm, [shape[0] * self.n_heads, shape[1], -1])\n",
+    "    def call(self, inputs):\n",
+    "        q = inputs[0]\n",
+    "        v = inputs[1]\n",
+    "        k = inputs[2] if len(inputs) > 2 else v\n",
+    "        shape = K.shape(q)\n",
+    "        q_proj = self._multi_head_linear(q, self.q_linear)\n",
+    "        v_proj = self._multi_head_linear(v, self.v_linear)\n",
+    "        k_proj = self._multi_head_linear(k, self.k_linear)\n",
+    "        multi_attended = self.attention([q_proj, v_proj, k_proj])\n",
+    "        shape_attended = K.shape(multi_attended)\n",
+    "        reshaped_attended = K.reshape(multi_attended, [shape[0], self.n_heads, shape_attended[1], shape_attended[2]])\n",
+    "        perm = K.permute_dimensions(reshaped_attended, [0, 2, 1, 3])\n",
+    "        concat = K.reshape(perm, [shape[0], shape_attended[1], -1])\n",
+    "        return self.out_linear(concat)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "Q = np.random.rand(2, 50, 512)\n",
+    "V = np.random.rand(2, 80, 512)\n",
+    "multi_attn = MultiHeadAttention(8)\n",
+    "multi_attn([Q, V]).shape"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  },
+  "nav_menu": {},
+  "toc": {
+   "navigate_menu": true,
+   "number_sections": true,
+   "sideBar": true,
+   "threshold": 6,
+   "toc_cell": false,
+   "toc_section_display": "block",
+   "toc_window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}