diff --git a/16_nlp_with_rnns_and_attention.ipynb b/16_nlp_with_rnns_and_attention.ipynb
index 901a069..9bbef1f 100644
--- a/16_nlp_with_rnns_and_attention.ipynb
+++ b/16_nlp_with_rnns_and_attention.ipynb
@@ -28,16 +28,6 @@
""
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# WORK IN PROGRESS\n",
- "\n",
- "\n",
- "**I'm still working on updating this chapter to the 3rd edition. Please come back in a few weeks.**"
- ]
- },
{
"cell_type": "markdown",
"metadata": {
@@ -59,7 +49,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"metadata": {
"id": "TFSU3FCOpKzu"
},
@@ -81,7 +71,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"metadata": {
"id": "YqCwW7cMpKzw"
},
@@ -103,7 +93,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"metadata": {
"id": "0Piq5se2pKzx"
},
@@ -125,7 +115,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
"metadata": {
"id": "8d4TH3NbpKzx"
},
@@ -151,7 +141,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {
"id": "PQFH5Y9PpKzy"
},
@@ -180,7 +170,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 6,
"metadata": {
"id": "Ekxzo6pOpKzy"
},
@@ -199,86 +189,42 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "This notebooks uses the TensorFlow Addons library, and the Transformers library. If you're running on Colab, then we need to install them now:"
+ "# Generating Shakespearean Text Using a Character RNN"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Creating the Training Dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's download the Shakespeare data from Andrej Karpathy's [char-rnn project](https://github.com/karpathy/char-rnn/):"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 7,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Downloading data from https://homl.info/shakespeare\n",
+ "1122304/1115394 [==============================] - 0s 0us/step\n",
+ "1130496/1115394 [==============================] - 0s 0us/step\n"
+ ]
+ }
+ ],
"source": [
- "if \"google.colab\" in sys.modules:\n",
- " %pip install -q -U tensorflow-addons\n",
- " %pip install -q -U transformers"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Char-RNN"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Splitting a sequence into batches of shuffled windows"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "For example, let's split the sequence 0 to 14 into windows of length 5, each shifted by 2 (e.g.,`[0, 1, 2, 3, 4]`, `[2, 3, 4, 5, 6]`, etc.), then shuffle them, and split them into inputs (the first 4 steps) and targets (the last 4 steps) (e.g., `[2, 3, 4, 5, 6]` would be split into `[[2, 3, 4, 5], [3, 4, 5, 6]]`), then create batches of 3 such input/target pairs:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)\n",
+ "import tensorflow as tf\n",
"\n",
- "n_steps = 5\n",
- "dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))\n",
- "dataset = dataset.window(n_steps, shift=2, drop_remainder=True)\n",
- "dataset = dataset.flat_map(lambda window: window.batch(n_steps))\n",
- "dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))\n",
- "dataset = dataset.batch(3).prefetch(1)\n",
- "for index, (X_batch, Y_batch) in enumerate(dataset):\n",
- " print(\"_\" * 20, \"Batch\", index, \"\\nX_batch\")\n",
- " print(X_batch.numpy())\n",
- " print(\"=\" * 5, \"\\nY_batch\")\n",
- " print(Y_batch.numpy())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Loading the Data and Preparing the Dataset"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "shakespeare_url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n",
+ "shakespeare_url = \"https://homl.info/shakespeare\" # shortcut URL\n",
"filepath = tf.keras.utils.get_file(\"shakespeare.txt\", shakespeare_url)\n",
"with open(filepath) as f:\n",
" shakespeare_text = f.read()"
@@ -286,269 +232,440 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 8,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "First Citizen:\n",
+ "Before we proceed any further, hear me speak.\n",
+ "\n",
+ "All:\n",
+ "Speak, speak.\n"
+ ]
+ }
+ ],
"source": [
- "print(shakespeare_text[:148])"
+ "# extra code – shows a short text sample\n",
+ "print(shakespeare_text[:80])"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "\"\\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz\""
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
+ "# extra code – shows all 39 distinct characters (after converting to lower case)\n",
"\"\".join(sorted(set(shakespeare_text.lower())))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
- "tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)\n",
- "tokenizer.fit_on_texts(shakespeare_text)"
+ "text_vec_layer = tf.keras.layers.TextVectorization(split=\"character\",\n",
+ " standardize=\"lower\")\n",
+ "text_vec_layer.adapt([shakespeare_text])\n",
+ "encoded = text_vec_layer([shakespeare_text])[0]"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "tokenizer.texts_to_sequences([\"First\"])"
+ "encoded -= 2 # drop tokens 0 (pad) and 1 (unknown), which we will not use\n",
+ "n_tokens = text_vec_layer.vocabulary_size() - 2 # number of distinct chars = 39\n",
+ "dataset_size = len(encoded) # total number of chars = 1,115,394"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "39"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])"
+ "n_tokens"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 13,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "1115394"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "max_id = len(tokenizer.word_index) # number of distinct characters\n",
- "dataset_size = tokenizer.document_count # total number of characters"
+ "dataset_size"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
- "[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1\n",
- "train_size = dataset_size * 90 // 100\n",
- "dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Note**: in previous versions of this code, we used `dataset.repeat()` now to make the dataset \"infinite\", and later in the notebook we set the `steps_per_epoch` argument when calling the `model.fit()` method. This was needed to work around some TensorFlow bugs. However, since these bugs have now been fixed, we can simplify the code: no need for `dataset.repeat()` or `steps_per_epoch` anymore."
+ "def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):\n",
+ " ds = tf.data.Dataset.from_tensor_slices(sequence)\n",
+ " ds = ds.window(length + 1, shift=1, drop_remainder=True)\n",
+ " ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))\n",
+ " if shuffle:\n",
+ " ds = ds.shuffle(100_000, seed=seed)\n",
+ " ds = ds.batch(batch_size)\n",
+ " return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(,\n",
+ " )]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "n_steps = 100\n",
- "window_length = n_steps + 1 # target = input shifted 1 character ahead\n",
- "dataset = dataset.window(window_length, shift=1, drop_remainder=True)"
+ "# extra code – a simple example using to_dataset()\n",
+ "# There's just one sample in this dataset: the input represents \"to b\" and the\n",
+ "# output represents \"o be\"\n",
+ "list(to_dataset(text_vec_layer([\"To be\"])[0], length=4))"
]
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset = dataset.flat_map(lambda window: window.batch(window_length))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "batch_size = 32\n",
- "dataset = dataset.shuffle(10000).batch(batch_size)\n",
- "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset = dataset.map(\n",
- " lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset = dataset.prefetch(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for X_batch, Y_batch in dataset.take(1):\n",
- " print(X_batch.shape, Y_batch.shape)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Creating and Training the Model"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Warning**: the following code may take up to 24 hours to run, depending on your hardware. If you use a GPU, it may take just 1 or 2 hours, or less."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Note**: the `GRU` class will only use the GPU (if you have one) when using the default values for the following arguments: `activation`, `recurrent_activation`, `recurrent_dropout`, `unroll`, `use_bias` and `reset_after`. This is why I commented out `recurrent_dropout=0.2` (compared to the book)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],\n",
- " #dropout=0.2, recurrent_dropout=0.2),\n",
- " dropout=0.2),\n",
- " tf.keras.layers.GRU(128, return_sequences=True,\n",
- " #dropout=0.2, recurrent_dropout=0.2),\n",
- " dropout=0.2),\n",
- " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,\n",
- " activation=\"softmax\"))\n",
- "])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n",
- "history = model.fit(dataset, epochs=10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Using the Model to Generate Text"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def preprocess(texts):\n",
- " X = np.array(tokenizer.texts_to_sequences(texts)) - 1\n",
- " return tf.one_hot(X, max_id)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Warning**: the `predict_classes()` method is deprecated. Instead, we must use `model(X_new).argmax(axis=-1)`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_new = preprocess([\"How are yo\"])\n",
- "#Y_pred = model.predict_classes(X_new)\n",
- "Y_pred = model(X_new).argmax(axis=-1)\n",
- "tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
+ "length = 100\n",
"tf.random.set_seed(42)\n",
- "\n",
- "tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()"
+ "train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True,\n",
+ " seed=42)\n",
+ "valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)\n",
+ "test_set = to_dataset(encoded[1_060_000:], length=length)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Building and Training the Char-RNN Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following code may take one or two hours to run, depending on your GPU. Without a GPU, it may take over 24 hours. If you don't want to wait, just skip the next two code cells and run the code below to download a pretrained model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Note**: the `GRU` class will only use cuDNN acceleration (assuming you have a GPU) when using the default values for the following arguments: `activation`, `recurrent_activation`, `recurrent_dropout`, `unroll`, `use_bias` and `reset_after`."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1407s 45ms/step - loss: 1.3873 - accuracy: 0.5754 - val_loss: 1.6155 - val_accuracy: 0.5333\n",
+ "Epoch 2/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1376s 44ms/step - loss: 1.2921 - accuracy: 0.5973 - val_loss: 1.5881 - val_accuracy: 0.5401\n",
+ "Epoch 3/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1379s 44ms/step - loss: 1.2743 - accuracy: 0.6015 - val_loss: 1.5885 - val_accuracy: 0.5407\n",
+ "Epoch 4/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1381s 44ms/step - loss: 1.2654 - accuracy: 0.6031 - val_loss: 1.5701 - val_accuracy: 0.5418\n",
+ "Epoch 5/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1379s 44ms/step - loss: 1.2594 - accuracy: 0.6045 - val_loss: 1.5674 - val_accuracy: 0.5450\n",
+ "Epoch 6/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1386s 44ms/step - loss: 1.2545 - accuracy: 0.6058 - val_loss: 1.5587 - val_accuracy: 0.5492\n",
+ "Epoch 7/10\n",
+ "31247/31247 [==============================] - 1381s 44ms/step - loss: 1.2514 - accuracy: 0.6062 - val_loss: 1.5532 - val_accuracy: 0.5460\n",
+ "Epoch 8/10\n",
+ "31247/31247 [==============================] - 1381s 44ms/step - loss: 1.2485 - accuracy: 0.6067 - val_loss: 1.5522 - val_accuracy: 0.5479\n",
+ "Epoch 9/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1382s 44ms/step - loss: 1.2460 - accuracy: 0.6073 - val_loss: 1.5521 - val_accuracy: 0.5497\n",
+ "Epoch 10/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "31247/31247 [==============================] - 1385s 44ms/step - loss: 1.2436 - accuracy: 0.6080 - val_loss: 1.5477 - val_accuracy: 0.5513\n"
+ ]
+ }
+ ],
+ "source": [
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n",
+ "model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),\n",
+ " tf.keras.layers.GRU(128, return_sequences=True),\n",
+ " tf.keras.layers.Dense(n_tokens, activation=\"softmax\")\n",
+ "])\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "model_ckpt = tf.keras.callbacks.ModelCheckpoint(\n",
+ " \"my_shakespeare_model\", monitor=\"val_accuracy\", save_best_only=True)\n",
+ "history = model.fit(train_set, validation_data=valid_set, epochs=10,\n",
+ " callbacks=[model_ckpt])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "shakespeare_model = tf.keras.Sequential([\n",
+ " text_vec_layer,\n",
+ " tf.keras.layers.Lambda(lambda X: X - 2), # no <PAD> or <UNK> tokens\n",
+ " model\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you don't want to wait for training to complete, I've pretrained a model for you. The following code will download it. Uncomment the last line if you want to use it instead of the model trained above."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – downloads a pretrained model\n",
+ "url = \"https://github.com/ageron/data/raw/main/shakespeare_model.tgz\"\n",
+ "path = tf.keras.utils.get_file(\"shakespeare_model.tgz\", url, extract=True)\n",
+ "model_path = Path(path).with_name(\"shakespeare_model\")\n",
+ "#shakespeare_model = tf.keras.models.load_model(model_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'e'"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "y_proba = shakespeare_model.predict([\"To be or not to b\"])[0, -1]\n",
+ "y_pred = tf.argmax(y_proba) # choose the most probable character ID\n",
+ "text_vec_layer.get_vocabulary()[y_pred + 2]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Generating Fake Shakespearean Text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "log_probas = tf.math.log([[0.5, 0.4, 0.1]]) # probas = 50%, 40%, and 10%\n",
+ "tf.random.set_seed(42)\n",
+ "tf.random.categorical(log_probas, num_samples=8) # draw 8 samples"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def next_char(text, temperature=1):\n",
- " X_new = preprocess([text])\n",
- " y_proba = model(X_new)[0, -1:, :]\n",
+ " y_proba = shakespeare_model.predict([text])[0, -1:]\n",
" rescaled_logits = tf.math.log(y_proba) / temperature\n",
- " char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1\n",
- " return tokenizer.sequences_to_texts(char_id.numpy())[0]"
+ " char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]\n",
+ " return text_vec_layer.get_vocabulary()[char_id + 2]"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)\n",
- "\n",
- "next_char(\"How are yo\", temperature=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def complete_text(text, n_chars=50, temperature=1):\n",
+ "def extend_text(text, n_chars=50, temperature=1):\n",
" for _ in range(n_chars):\n",
" text += next_char(text, temperature)\n",
" return text"
@@ -556,31 +673,68 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)\n",
- "\n",
- "print(complete_text(\"t\", temperature=0.2))"
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "To be or not to be the duke\n",
+ "as it is a proper strange death,\n",
+ "and the\n"
+ ]
+ }
+ ],
"source": [
- "print(complete_text(\"t\", temperature=1))"
+ "print(extend_text(\"To be or not to be\", temperature=0.01))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "To be or not to behold?\n",
+ "\n",
+ "second push:\n",
+ "gremio, lord all, a sistermen,\n"
+ ]
+ }
+ ],
"source": [
- "print(complete_text(\"t\", temperature=2))"
+ "print(extend_text(\"To be or not to be\", temperature=1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "To be or not to bef ,mt'&o3fpadm!$\n",
+ "wh!nse?bws3est--vgerdjw?c-y-ewznq\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(extend_text(\"To be or not to be\", temperature=100))"
]
},
{
@@ -592,79 +746,124 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)"
+ "def to_dataset_for_stateful_rnn(sequence, length):\n",
+ " ds = tf.data.Dataset.from_tensor_slices(sequence)\n",
+ " ds = ds.window(length + 1, shift=length, drop_remainder=True)\n",
+ " ds = ds.flat_map(lambda window: window.batch(length + 1)).batch(1)\n",
+ " return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)\n",
+ "\n",
+ "stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)\n",
+ "stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000], length)\n",
+ "stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 29,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
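+ "[(<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[0, 1, 2]], dtype=int32)>,\n",
+ "  <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[1, 2, 3]], dtype=int32)>),\n",
+ " (<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[3, 4, 5]], dtype=int32)>,\n",
+ "  <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[4, 5, 6]], dtype=int32)>),\n",
+ " (<tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[6, 7, 8]], dtype=int32)>,\n",
+ "  <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[7, 8, 9]], dtype=int32)>)]"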
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])\n",
- "dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n",
- "dataset = dataset.flat_map(lambda window: window.batch(window_length))\n",
- "dataset = dataset.batch(1)\n",
- "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n",
- "dataset = dataset.map(\n",
- " lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n",
- "dataset = dataset.prefetch(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "batch_size = 32\n",
- "encoded_parts = np.array_split(encoded[:train_size], batch_size)\n",
- "datasets = []\n",
- "for encoded_part in encoded_parts:\n",
- " dataset = tf.data.Dataset.from_tensor_slices(encoded_part)\n",
- " dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n",
- " dataset = dataset.flat_map(lambda window: window.batch(window_length))\n",
- " datasets.append(dataset)\n",
- "dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))\n",
- "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n",
- "dataset = dataset.map(\n",
- " lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n",
- "dataset = dataset.prefetch(1)"
+ "# extra code – simple example using to_dataset_for_stateful_rnn()\n",
+ "list(to_dataset_for_stateful_rnn(tf.range(10), 3))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Note**: once again, I commented out `recurrent_dropout=0.2` (compared to the book) so you can get GPU acceleration (if you have one)."
+ "If you'd like to have more than one window per batch, you can use the `to_batched_dataset_for_stateful_rnn()` function instead of `to_dataset_for_stateful_rnn()`:"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[(,\n",
+ " ),\n",
+ " (,\n",
+ " ),\n",
+ " (,\n",
+ " )]"
+ ]
+ },
+ "execution_count": 30,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# extra code – shows one way to prepare a batched dataset for a stateful RNN\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "def to_non_overlapping_windows(sequence, length):\n",
+ " ds = tf.data.Dataset.from_tensor_slices(sequence)\n",
+ " ds = ds.window(length + 1, shift=length, drop_remainder=True)\n",
+ " return ds.flat_map(lambda window: window.batch(length + 1))\n",
+ "\n",
+ "def to_batched_dataset_for_stateful_rnn(sequence, length, batch_size=32):\n",
+ " parts = np.array_split(sequence, batch_size)\n",
+ " datasets = tuple(to_non_overlapping_windows(part, length) for part in parts)\n",
+ " ds = tf.data.Dataset.zip(datasets).map(lambda *windows: tf.stack(windows))\n",
+ " return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)\n",
+ "\n",
+ "list(to_batched_dataset_for_stateful_rnn(tf.range(20), length=3, batch_size=2))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.GRU(128, return_sequences=True, stateful=True,\n",
- " #dropout=0.2, recurrent_dropout=0.2,\n",
- " dropout=0.2,\n",
- " batch_input_shape=[batch_size, None, max_id]),\n",
- " tf.keras.layers.GRU(128, return_sequences=True, stateful=True,\n",
- " #dropout=0.2, recurrent_dropout=0.2),\n",
- " dropout=0.2),\n",
- " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,\n",
- " activation=\"softmax\"))\n",
+ " tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16,\n",
+ " batch_input_shape=[1, None]),\n",
+ " tf.keras.layers.GRU(128, return_sequences=True, stateful=True),\n",
+ " tf.keras.layers.Dense(n_tokens, activation=\"softmax\")\n",
"])"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@@ -675,33 +874,197 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n",
- "history = model.fit(dataset, epochs=50,\n",
- " callbacks=[ResetStatesCallback()])"
+ "# extra code – use a different directory to save the checkpoints\n",
+ "model_ckpt = tf.keras.callbacks.ModelCheckpoint(\n",
+ " \"my_stateful_shakespeare_model\",\n",
+ " monitor=\"val_accuracy\",\n",
+ " save_best_only=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "To use the model with different batch sizes, we need to create a stateless copy. We can get rid of dropout since it is only used during training:"
+ "**Warning**: the following cell will take a while to run (possibly an hour if you are not using a GPU)."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 213s 21ms/step - loss: 1.8690 - accuracy: 0.4494 - val_loss: 1.7632 - val_accuracy: 0.4672\n",
+ "Epoch 2/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 211s 21ms/step - loss: 1.5635 - accuracy: 0.5284 - val_loss: 1.6334 - val_accuracy: 0.4994\n",
+ "Epoch 3/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 209s 21ms/step - loss: 1.4875 - accuracy: 0.5478 - val_loss: 1.5788 - val_accuracy: 0.5153\n",
+ "Epoch 4/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 208s 21ms/step - loss: 1.4483 - accuracy: 0.5579 - val_loss: 1.5471 - val_accuracy: 0.5236\n",
+ "Epoch 5/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 213s 21ms/step - loss: 1.4241 - accuracy: 0.5643 - val_loss: 1.5270 - val_accuracy: 0.5286\n",
+ "Epoch 6/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 215s 21ms/step - loss: 1.4074 - accuracy: 0.5686 - val_loss: 1.5109 - val_accuracy: 0.5338\n",
+ "Epoch 7/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 210s 21ms/step - loss: 1.3953 - accuracy: 0.5714 - val_loss: 1.5008 - val_accuracy: 0.5361\n",
+ "Epoch 8/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 212s 21ms/step - loss: 1.3863 - accuracy: 0.5737 - val_loss: 1.4938 - val_accuracy: 0.5381\n",
+ "Epoch 9/10\n",
+ "9999/9999 [==============================] - 207s 21ms/step - loss: 1.3790 - accuracy: 0.5757 - val_loss: 1.4890 - val_accuracy: 0.5380\n",
+ "Epoch 10/10\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "9999/9999 [==============================] - 208s 21ms/step - loss: 1.3729 - accuracy: 0.5770 - val_loss: 1.4786 - val_accuracy: 0.5420\n"
+ ]
+ }
+ ],
+ "source": [
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(stateful_train_set, validation_data=stateful_valid_set,\n",
+ " epochs=10, callbacks=[ResetStatesCallback(), model_ckpt])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Extra Material: converting the stateful RNN to a stateless RNN and using it**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To use the model with different batch sizes, we need to create a stateless copy:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"stateless_model = tf.keras.Sequential([\n",
- " tf.keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),\n",
+ " tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),\n",
" tf.keras.layers.GRU(128, return_sequences=True),\n",
- " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,\n",
- " activation=\"softmax\"))\n",
+ " tf.keras.layers.Dense(n_tokens, activation=\"softmax\")\n",
"])"
]
},
@@ -714,32 +1077,53 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
- "stateless_model.build(tf.TensorShape([None, None, max_id]))"
+ "stateless_model.build(tf.TensorShape([None, None]))"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
- "stateless_model.set_weights(model.get_weights())\n",
- "model = stateless_model"
+ "stateless_model.set_weights(model.get_weights())"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 38,
"metadata": {},
"outputs": [],
+ "source": [
+ "shakespeare_model = tf.keras.Sequential([\n",
+ " text_vec_layer,\n",
+ " tf.keras.layers.Lambda(lambda X: X - 2), # no <PAD> or <UNK> tokens\n",
+ " stateless_model\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "to be or not to be so in the world and the strangeness\n",
+ "to see the wo\n"
+ ]
+ }
+ ],
"source": [
"tf.random.set_seed(42)\n",
"\n",
- "print(complete_text(\"t\"))"
+ "print(extend_text(\"to be or not to be\", temperature=0.01))"
]
},
{
@@ -751,243 +1135,286 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/ageron/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...\u001b[0m\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "055c0f544ac349d9a14da8f843651df0",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Dl Completed...: 0 url [00:00, ? url/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e2abc244f4844d56919979b33cc2fa79",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Dl Size...: 0 MiB [00:00, ? MiB/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "af507eed124c4ff6900538205b1b00fd",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating splits...: 0%| | 0/3 [00:00, ? splits/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "18cd596aa97b46f1aa3f93d0c29edd59",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating train examples...: 0%| | 0/25000 [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "7c623038199e46909b7a8b0a39cecbab",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Shuffling /home/ageron/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete0WPKUH/imdb_reviews-train.t…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "b2c0a46cc37b4eb6b9feb67d715d7022",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating test examples...: 0%| | 0/25000 [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "4abb656c416049c085e0f2f761d5bf9c",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Shuffling /home/ageron/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete0WPKUH/imdb_reviews-test.tf…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "edb7ceb384634b8ebd766e55ba21c5d4",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Generating unsupervised examples...: 0%| | 0/50000 [00:00, ? examples/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "ad80e1205d5e4914840999fcd3ae3b88",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Shuffling /home/ageron/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incomplete0WPKUH/imdb_reviews-unsuper…"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1mDataset imdb_reviews downloaded and prepared to /home/ageron/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.\u001b[0m\n"
+ ]
+ }
+ ],
+ "source": [
+ "import tensorflow_datasets as tfds\n",
+ "\n",
+ "raw_train_set, raw_valid_set, raw_test_set = tfds.load(\n",
+ " name=\"imdb_reviews\",\n",
+ " split=[\"train[:90%]\", \"train[90%:]\", \"test\"],\n",
+ " as_supervised=True\n",
+ ")\n",
+ "tf.random.set_seed(42)\n",
+ "train_set = raw_train_set.shuffle(5000, seed=42).batch(32).prefetch(1)\n",
+ "valid_set = raw_valid_set.batch(32).prefetch(1)\n",
+ "test_set = raw_test_set.batch(32).prefetch(1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting ...\n",
+ "Label: 0\n",
+ "I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However ...\n",
+ "Label: 0\n",
+ "Mann photographs the Alberta Rocky Mountains in a superb fashion, and Jimmy Stewart and Walter Brennan give enjoyable performances as they always seem to do. <br /><br />But come on Hollywood - a Moun ...\n",
+ "Label: 0\n",
+ "This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful perf ...\n",
+ "Label: 1\n"
+ ]
+ }
+ ],
+ "source": [
+ "for review, label in raw_train_set.take(4):\n",
+ " print(review.numpy().decode(\"utf-8\")[:200], \"...\")\n",
+ " print(\"Label:\", label.numpy())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)"
+ "vocab_size = 1000\n",
+ "text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)\n",
+ "text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "You can load the IMDB dataset easily:"
+ "**Warning**: the following cell will take a few minutes to run and the model will probably not learn anything because we didn't mask the padding tokens (that's the point of the next section)."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 43,
"metadata": {},
- "outputs": [],
- "source": [
- "(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train[0][:10]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "word_index = tf.keras.datasets.imdb.get_word_index()\n",
- "id_to_word = {id_ + 3: word for word, id_ in word_index.items()}\n",
- "for id_, token in enumerate((\"<pad>\", \"<sos>\", \"<unk>\")):\n",
- " id_to_word[id_] = token\n",
- "\" \".join([id_to_word[id_] for id_ in X_train[0][:10]])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import tensorflow_datasets as tfds\n",
- "\n",
- "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "datasets.keys()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "train_size = info.splits[\"train\"].num_examples\n",
- "test_size = info.splits[\"test\"].num_examples"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "train_size, test_size"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for X_batch, y_batch in datasets[\"train\"].batch(2).take(1):\n",
- " for review, label in zip(X_batch.numpy(), y_batch.numpy()):\n",
- " print(\"Review:\", review.decode(\"utf-8\")[:200], \"...\")\n",
- " print(\"Label:\", label, \"= Positive\" if label else \"= Negative\")\n",
- " print()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def preprocess(X_batch, y_batch):\n",
- " X_batch = tf.strings.substr(X_batch, 0, 300)\n",
- " X_batch = tf.strings.regex_replace(X_batch, rb\"<br\\s*/?>\", b\" \")\n",
- " X_batch = tf.strings.regex_replace(X_batch, b\"[^a-zA-Z']\", b\" \")\n",
- " X_batch = tf.strings.split(X_batch)\n",
- " return X_batch.to_tensor(default_value=b\"\"), y_batch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "preprocess(X_batch, y_batch)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from collections import Counter\n",
- "\n",
- "vocabulary = Counter()\n",
- "for X_batch, y_batch in datasets[\"train\"].batch(32).map(preprocess):\n",
- " for review in X_batch:\n",
- " vocabulary.update(list(review.numpy()))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "vocabulary.most_common()[:3]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "len(vocabulary)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "vocab_size = 10000\n",
- "truncated_vocabulary = [\n",
- " word for word, count in vocabulary.most_common()[:vocab_size]]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}\n",
- "for word in b\"This movie was faaaaaantastic\".split():\n",
- " print(word_to_id.get(word) or vocab_size)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "words = tf.constant(truncated_vocabulary)\n",
- "word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)\n",
- "vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)\n",
- "num_oov_buckets = 1000\n",
- "table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "table.lookup(tf.constant([b\"This movie was faaaaaantastic\".split()]))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def encode_words(X_batch, y_batch):\n",
- " return table.lookup(X_batch), y_batch\n",
- "\n",
- "train_set = datasets[\"train\"].batch(32).map(preprocess)\n",
- "train_set = train_set.map(encode_words).prefetch(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "for X_batch, y_batch in train_set.take(1):\n",
- " print(X_batch)\n",
- " print(y_batch)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/2\n",
+ "704/704 [==============================] - 255s 359ms/step - loss: 0.6934 - accuracy: 0.4990 - val_loss: 0.6931 - val_accuracy: 0.5016\n",
+ "Epoch 2/2\n",
+ "704/704 [==============================] - 250s 355ms/step - loss: 0.6934 - accuracy: 0.5042 - val_loss: 0.6942 - val_accuracy: 0.5008\n"
+ ]
+ }
+ ],
"source": [
"embed_size = 128\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,\n",
- " mask_zero=True, # not shown in the book\n",
- " input_shape=[None]),\n",
- " tf.keras.layers.GRU(128, return_sequences=True),\n",
+ " text_vec_layer,\n",
+ " tf.keras.layers.Embedding(vocab_size, embed_size),\n",
" tf.keras.layers.GRU(128),\n",
" tf.keras.layers.Dense(1, activation=\"sigmoid\")\n",
"])\n",
- "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n",
- "history = model.fit(train_set, epochs=5)"
+ "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(train_set, validation_data=valid_set, epochs=2)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Masking"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following cell will take a while to run (possibly 30 minutes if you are not using a GPU)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/5\n",
+ "704/704 [==============================] - 303s 426ms/step - loss: 0.5296 - accuracy: 0.7234 - val_loss: 0.4045 - val_accuracy: 0.8244\n",
+ "Epoch 2/5\n",
+ "704/704 [==============================] - 295s 419ms/step - loss: 0.3702 - accuracy: 0.8418 - val_loss: 0.3390 - val_accuracy: 0.8532\n",
+ "Epoch 3/5\n",
+ "704/704 [==============================] - 298s 423ms/step - loss: 0.3057 - accuracy: 0.8747 - val_loss: 0.3196 - val_accuracy: 0.8696\n",
+ "Epoch 4/5\n",
+ "704/704 [==============================] - 294s 418ms/step - loss: 0.2784 - accuracy: 0.8871 - val_loss: 0.3162 - val_accuracy: 0.8596\n",
+ "Epoch 5/5\n",
+ "704/704 [==============================] - 293s 417ms/step - loss: 0.2597 - accuracy: 0.8961 - val_loss: 0.3209 - val_accuracy: 0.8548\n"
+ ]
+ }
+ ],
+ "source": [
+ "embed_size = 128\n",
+ "tf.random.set_seed(42)\n",
+ "model = tf.keras.Sequential([\n",
+ " text_vec_layer,\n",
+ " tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),\n",
+ " tf.keras.layers.GRU(128),\n",
+ " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n",
+ "])\n",
+ "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(train_set, validation_data=valid_set, epochs=5)"
]
},
{
@@ -999,405 +1426,1497 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
- "K = tf.keras.backend\n",
- "embed_size = 128\n",
- "inputs = tf.keras.layers.Input(shape=[None])\n",
- "mask = tf.keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)\n",
- "z = tf.keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)\n",
- "z = tf.keras.layers.GRU(128, return_sequences=True)(z, mask=mask)\n",
- "z = tf.keras.layers.GRU(128)(z, mask=mask)\n",
- "outputs = tf.keras.layers.Dense(1, activation=\"sigmoid\")(z)\n",
- "model = tf.keras.Model(inputs=[inputs], outputs=[outputs])\n",
- "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n",
- "history = model.fit(train_set, epochs=5)"
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on the CPU\n",
+ "inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)\n",
+ "token_ids = text_vec_layer(inputs)\n",
+ "mask = tf.math.not_equal(token_ids, 0)\n",
+ "Z = tf.keras.layers.Embedding(vocab_size, embed_size)(token_ids)\n",
+ "Z = tf.keras.layers.GRU(128, dropout=0.2)(Z, mask=mask)\n",
+ "outputs = tf.keras.layers.Dense(1, activation=\"sigmoid\")(Z)\n",
+ "model = tf.keras.Model(inputs=[inputs], outputs=[outputs])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Reusing Pretrained Embeddings"
+ "**Warning**: the following cell will take a while to run (possibly 30 minutes if you are not using a GPU)."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 46,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/5\n",
+ "704/704 [==============================] - 303s 427ms/step - loss: 0.5447 - accuracy: 0.7198 - val_loss: 0.4604 - val_accuracy: 0.7720\n",
+ "Epoch 2/5\n",
+ "704/704 [==============================] - 301s 427ms/step - loss: 0.3469 - accuracy: 0.8512 - val_loss: 0.3214 - val_accuracy: 0.8608\n",
+ "Epoch 3/5\n",
+ "704/704 [==============================] - 295s 419ms/step - loss: 0.3054 - accuracy: 0.8713 - val_loss: 0.3069 - val_accuracy: 0.8672\n",
+ "Epoch 4/5\n",
+ "704/704 [==============================] - 295s 420ms/step - loss: 0.2798 - accuracy: 0.8828 - val_loss: 0.3028 - val_accuracy: 0.8672\n",
+ "Epoch 5/5\n",
+ "704/704 [==============================] - 298s 423ms/step - loss: 0.2622 - accuracy: 0.8920 - val_loss: 0.2953 - val_accuracy: 0.8700\n"
+ ]
+ }
+ ],
"source": [
- "tf.random.set_seed(42)"
+ "# extra code – compiles and trains the model, as usual\n",
+ "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(train_set, validation_data=valid_set, epochs=5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Extra material: using ragged tensors**"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 47,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "tfhub_cache_dir = Path() / \"my_tfhub_cache\"\n",
- "os.environ[\"TFHUB_CACHE_DIR\"] = str(tfhub_cache_dir)"
+ "text_vec_layer_ragged = tf.keras.layers.TextVectorization(\n",
+ " max_tokens=vocab_size, ragged=True)\n",
+ "text_vec_layer_ragged.adapt(train_set.map(lambda reviews, labels: reviews))\n",
+ "text_vec_layer_ragged([\"Great movie!\", \"This is DiCaprio's best role.\"])"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 48,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "import tensorflow_hub as hub\n",
- "\n",
+ "text_vec_layer([\"Great movie!\", \"This is DiCaprio's best role.\"])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following cell will take a while to run (possibly 30 minutes if you are not using a GPU)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/5\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "704/704 [==============================] - 280s 395ms/step - loss: 0.5038 - accuracy: 0.7496 - val_loss: 0.6706 - val_accuracy: 0.6752\n",
+ "Epoch 2/5\n",
+ "704/704 [==============================] - 277s 393ms/step - loss: 0.4499 - accuracy: 0.7892 - val_loss: 0.3494 - val_accuracy: 0.8500\n",
+ "Epoch 3/5\n",
+ "704/704 [==============================] - 276s 392ms/step - loss: 0.3270 - accuracy: 0.8592 - val_loss: 0.3855 - val_accuracy: 0.8260\n",
+ "Epoch 4/5\n",
+ "704/704 [==============================] - 277s 394ms/step - loss: 0.2935 - accuracy: 0.8760 - val_loss: 0.3401 - val_accuracy: 0.8520\n",
+ "Epoch 5/5\n",
+ "704/704 [==============================] - 275s 390ms/step - loss: 0.2742 - accuracy: 0.8854 - val_loss: 0.3971 - val_accuracy: 0.8208\n"
+ ]
+ }
+ ],
+ "source": [
+ "embed_size = 128\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " hub.KerasLayer(\"https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1\",\n",
- " dtype=tf.string, input_shape=[], output_shape=[50]),\n",
- " tf.keras.layers.Dense(128, activation=\"relu\"),\n",
+ " text_vec_layer_ragged,\n",
+ " tf.keras.layers.Embedding(vocab_size, embed_size),\n",
+ " tf.keras.layers.GRU(128),\n",
" tf.keras.layers.Dense(1, activation=\"sigmoid\")\n",
"])\n",
- "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\",\n",
- " metrics=[\"accuracy\"])"
+ "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(train_set, validation_data=valid_set, epochs=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's define a `tree()` function to view the structure of the cache directory TF Hub just created:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def tree(path, level=0, indent=4):\n",
- " if level == 0:\n",
- " print(f\"{path}/\")\n",
- " level += 1\n",
- " sub_paths = sorted(path.iterdir())\n",
- " sub_dirs = [sub_path for sub_path in sub_paths if sub_path.is_dir()]\n",
- " filepaths = [sub_path for sub_path in sub_paths if not sub_path in sub_dirs]\n",
- " indent_str = \" \" * indent * level\n",
- " for sub_dir in sub_dirs:\n",
- " print(f\"{indent_str}{sub_dir.name}/\")\n",
- " tree(sub_dir, level + 1, indent)\n",
- " for filepath in filepaths:\n",
- " print(f\"{indent_str}{filepath.name}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tree(tfhub_cache_dir)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import tensorflow_datasets as tfds\n",
- "\n",
- "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)\n",
- "train_size = info.splits[\"train\"].num_examples\n",
- "batch_size = 32\n",
- "train_set = datasets[\"train\"].batch(batch_size).prefetch(1)\n",
- "history = model.fit(train_set, epochs=5)"
+ "## Reusing Pretrained Embeddings and Language Models"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Automatic Translation"
+ "**Warning**: the following cell will take a while to run (possibly an hour if you are not using a GPU)."
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 50,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "704/704 [==============================] - 224s 303ms/step - loss: 0.3141 - accuracy: 0.8648 - val_loss: 0.2397 - val_accuracy: 0.9008\n",
+ "Epoch 2/10\n",
+ "704/704 [==============================] - 205s 291ms/step - loss: 0.0489 - accuracy: 0.9852 - val_loss: 0.3257 - val_accuracy: 0.8936\n",
+ "Epoch 3/10\n",
+ "704/704 [==============================] - 204s 290ms/step - loss: 0.0061 - accuracy: 0.9988 - val_loss: 0.3963 - val_accuracy: 0.8944\n",
+ "Epoch 4/10\n",
+ "704/704 [==============================] - 204s 290ms/step - loss: 9.4918e-04 - accuracy: 0.9999 - val_loss: 0.4291 - val_accuracy: 0.8924\n",
+ "Epoch 5/10\n",
+ "704/704 [==============================] - 203s 289ms/step - loss: 5.1920e-04 - accuracy: 1.0000 - val_loss: 0.4691 - val_accuracy: 0.8932\n",
+ "Epoch 6/10\n",
+ "704/704 [==============================] - 204s 289ms/step - loss: 5.0053e-04 - accuracy: 1.0000 - val_loss: 0.4687 - val_accuracy: 0.8912\n",
+ "Epoch 7/10\n",
+ "704/704 [==============================] - 208s 296ms/step - loss: 3.7360e-04 - accuracy: 1.0000 - val_loss: 0.5034 - val_accuracy: 0.8984\n",
+ "Epoch 8/10\n",
+ "704/704 [==============================] - 209s 297ms/step - loss: 2.3907e-05 - accuracy: 1.0000 - val_loss: 0.5773 - val_accuracy: 0.8924\n",
+ "Epoch 9/10\n",
+ "704/704 [==============================] - 204s 290ms/step - loss: 9.0970e-06 - accuracy: 1.0000 - val_loss: 0.6163 - val_accuracy: 0.8972\n",
+ "Epoch 10/10\n",
+ "704/704 [==============================] - 205s 291ms/step - loss: 5.2528e-06 - accuracy: 1.0000 - val_loss: 0.6455 - val_accuracy: 0.8956\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 50,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "vocab_size = 100\n",
- "embed_size = 10"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import tensorflow_addons as tfa\n",
+ "import os\n",
+ "import tensorflow_hub as hub\n",
"\n",
- "encoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n",
- "decoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n",
- "sequence_lengths = tf.keras.layers.Input(shape=[], dtype=np.int32)\n",
- "\n",
- "embeddings = tf.keras.layers.Embedding(vocab_size, embed_size)\n",
- "encoder_embeddings = embeddings(encoder_inputs)\n",
- "decoder_embeddings = embeddings(decoder_inputs)\n",
- "\n",
- "encoder = tf.keras.layers.LSTM(512, return_state=True)\n",
- "encoder_outputs, state_h, state_c = encoder(encoder_embeddings)\n",
- "encoder_state = [state_h, state_c]\n",
- "\n",
- "sampler = tfa.seq2seq.sampler.TrainingSampler()\n",
- "\n",
- "decoder_cell = tf.keras.layers.LSTMCell(512)\n",
- "output_layer = tf.keras.layers.Dense(vocab_size)\n",
- "decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,\n",
- " output_layer=output_layer)\n",
- "final_outputs, final_state, final_sequence_lengths = decoder(\n",
- " decoder_embeddings, initial_state=encoder_state,\n",
- " sequence_length=sequence_lengths)\n",
- "Y_proba = tf.nn.softmax(final_outputs.rnn_output)\n",
- "\n",
- "model = tf.keras.Model(\n",
- " inputs=[encoder_inputs, decoder_inputs, sequence_lengths],\n",
- " outputs=[Y_proba])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "X = np.random.randint(100, size=1000 * 10).reshape(1000, 10)\n",
- "Y = np.random.randint(100, size=1000 * 15).reshape(1000, 15)\n",
- "X_decoder = np.c_[np.zeros((1000, 1)), Y[:, :-1]]\n",
- "seq_lengths = np.full([1000], 10)\n",
- "\n",
- "history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Bidirectional Recurrent Layers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
+ "os.environ[\"TFHUB_CACHE_DIR\"] = \"my_tfhub_cache\"\n",
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]),\n",
- " tf.keras.layers.Bidirectional(tf.keras.layers.GRU(10, return_sequences=True))\n",
+ " hub.KerasLayer(\"https://tfhub.dev/google/universal-sentence-encoder/4\",\n",
+ " trainable=True, dtype=tf.string, input_shape=[]),\n",
+ " tf.keras.layers.Dense(64, activation=\"relu\"),\n",
+ " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n",
"])\n",
- "\n",
- "model.summary()"
+ "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "model.fit(train_set, validation_data=valid_set, epochs=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Positional Encoding"
+ "# An Encoder–Decoder Network for Neural Machine Translation"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "url = \"https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\"\n",
+ "path = tf.keras.utils.get_file(\"spa-eng.zip\", origin=url, cache_dir=\"datasets\",\n",
+ " extract=True)\n",
+ "text = (Path(path).with_name(\"spa-eng\") / \"spa.txt\").read_text()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "\n",
+ "text = text.replace(\"¡\", \"\").replace(\"¿\", \"\")\n",
+ "pairs = [line.split(\"\\t\") for line in text.splitlines()]\n",
+ "np.random.seed(42) # extra code – ensures reproducibility on CPU\n",
+ "np.random.shuffle(pairs)\n",
+ "sentences_en, sentences_es = zip(*pairs) # separates the pairs into 2 lists"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "How boring! => Qué aburrimiento!\n",
+ "I love sports. => Adoro el deporte.\n",
+ "Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?\n"
+ ]
+ }
+ ],
+ "source": [
+ "for i in range(3):\n",
+ " print(sentences_en[i], \"=>\", sentences_es[i])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "vocab_size = 1000\n",
+ "max_length = 50\n",
+ "text_vec_layer_en = tf.keras.layers.TextVectorization(\n",
+ " vocab_size, output_sequence_length=max_length)\n",
+ "text_vec_layer_es = tf.keras.layers.TextVectorization(\n",
+ " vocab_size, output_sequence_length=max_length)\n",
+ "text_vec_layer_en.adapt(sentences_en)\n",
+ "text_vec_layer_es.adapt([f\"startofseq {s} endofseq\" for s in sentences_es])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']"
+ ]
+ },
+ "execution_count": 55,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "text_vec_layer_en.get_vocabulary()[:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "text_vec_layer_es.get_vocabulary()[:10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_train = tf.constant(sentences_en[:100_000])\n",
+ "X_valid = tf.constant(sentences_en[100_000:])\n",
+ "X_train_dec = tf.constant([f\"startofseq {s}\" for s in sentences_es[:100_000]])\n",
+ "X_valid_dec = tf.constant([f\"startofseq {s}\" for s in sentences_es[100_000:]])\n",
+ "Y_train = text_vec_layer_es([f\"{s} endofseq\" for s in sentences_es[:100_000]])\n",
+ "Y_valid = text_vec_layer_es([f\"{s} endofseq\" for s in sentences_es[100_000:]])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n",
+ "encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)\n",
+ "decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "embed_size = 128\n",
+ "encoder_input_ids = text_vec_layer_en(encoder_inputs)\n",
+ "decoder_input_ids = text_vec_layer_es(decoder_inputs)\n",
+ "encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,\n",
+ " mask_zero=True)\n",
+ "decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,\n",
+ " mask_zero=True)\n",
+ "encoder_embeddings = encoder_embedding_layer(encoder_input_ids)\n",
+ "decoder_embeddings = decoder_embedding_layer(decoder_input_ids)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "encoder = tf.keras.layers.LSTM(512, return_state=True)\n",
+ "encoder_outputs, *encoder_state = encoder(encoder_embeddings)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "decoder = tf.keras.layers.LSTM(512, return_sequences=True)\n",
+ "decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_layer = tf.keras.layers.Dense(vocab_size, activation=\"softmax\")\n",
+ "Y_proba = output_layer(decoder_outputs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following cell will take a while to run (possibly a couple hours if you are not using a GPU)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "3125/3125 [==============================] - 698s 221ms/step - loss: 0.4154 - accuracy: 0.4256 - val_loss: 0.3069 - val_accuracy: 0.5246\n",
+ "Epoch 2/10\n",
+ "3125/3125 [==============================] - 686s 219ms/step - loss: 0.2631 - accuracy: 0.5745 - val_loss: 0.2367 - val_accuracy: 0.6055\n",
+ "Epoch 3/10\n",
+ "3125/3125 [==============================] - 686s 220ms/step - loss: 0.2066 - accuracy: 0.6457 - val_loss: 0.2061 - val_accuracy: 0.6500\n",
+ "Epoch 4/10\n",
+ "3125/3125 [==============================] - 682s 218ms/step - loss: 0.1740 - accuracy: 0.6907 - val_loss: 0.1920 - val_accuracy: 0.6691\n",
+ "Epoch 5/10\n",
+ "3125/3125 [==============================] - 676s 216ms/step - loss: 0.1507 - accuracy: 0.7237 - val_loss: 0.1865 - val_accuracy: 0.6767\n",
+ "Epoch 6/10\n",
+ "3125/3125 [==============================] - 675s 216ms/step - loss: 0.1316 - accuracy: 0.7522 - val_loss: 0.1847 - val_accuracy: 0.6804\n",
+ "Epoch 7/10\n",
+ "3125/3125 [==============================] - 675s 216ms/step - loss: 0.1154 - accuracy: 0.7774 - val_loss: 0.1866 - val_accuracy: 0.6822\n",
+ "Epoch 8/10\n",
+ "3125/3125 [==============================] - 673s 215ms/step - loss: 0.1011 - accuracy: 0.8007 - val_loss: 0.1907 - val_accuracy: 0.6829\n",
+ "Epoch 9/10\n",
+ "3125/3125 [==============================] - 673s 215ms/step - loss: 0.0888 - accuracy: 0.8215 - val_loss: 0.1961 - val_accuracy: 0.6792\n",
+ "Epoch 10/10\n",
+ "3125/3125 [==============================] - 673s 215ms/step - loss: 0.0782 - accuracy: 0.8402 - val_loss: 0.2027 - val_accuracy: 0.6763\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 63,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n",
+ " outputs=[Y_proba])\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "model.fit((X_train, X_train_dec), Y_train, epochs=10,\n",
+ " validation_data=((X_valid, X_valid_dec), Y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def translate(sentence_en):\n",
+ "    \"\"\"Greedily translates an English sentence to Spanish, one word at a time.\n",
+ "\n",
+ "    At each step the model is fed the English sentence plus the translation\n",
+ "    produced so far, and the most likely next word is appended. Decoding\n",
+ "    stops when the model predicts \"endofseq\" or after `max_length` words.\n",
+ "\n",
+ "    Args:\n",
+ "        sentence_en: the English sentence to translate (a string).\n",
+ "\n",
+ "    Returns:\n",
+ "        The translation, as a string of space-separated words.\n",
+ "    \"\"\"\n",
+ "    X = np.array([sentence_en])  # encoder input (identical at every step)\n",
+ "    vocabulary = text_vec_layer_es.get_vocabulary()  # fetched only once\n",
+ "    translation = \"\"\n",
+ "    for word_idx in range(max_length):\n",
+ "        X_dec = np.array([\"startofseq \" + translation])  # decoder input\n",
+ "        # the output at position word_idx holds the next word's probas\n",
+ "        y_proba = model.predict((X, X_dec), verbose=0)[0, word_idx]\n",
+ "        predicted_word = vocabulary[np.argmax(y_proba)]\n",
+ "        if predicted_word == \"endofseq\":\n",
+ "            break\n",
+ "        translation += \" \" + predicted_word\n",
+ "    return translation.strip()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'me gusta el fútbol'"
+ ]
+ },
+ "execution_count": 65,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "translate(\"I like soccer\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Nice! However, the model struggles with longer sentences:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'me gusta el fútbol y a veces mismo al bus'"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "translate(\"I like soccer and also going to the beach\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Bidirectional RNNs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To create a bidirectional recurrent layer, just wrap a regular recurrent layer in a `Bidirectional` layer:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n",
+ "encoder = tf.keras.layers.Bidirectional(\n",
+ " tf.keras.layers.LSTM(256, return_state=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "encoder_outputs, *encoder_state = encoder(encoder_embeddings)\n",
+ "encoder_state = [tf.concat(encoder_state[::2], axis=-1), # short-term (0 & 2)\n",
+ " tf.concat(encoder_state[1::2], axis=-1)] # long-term (1 & 3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following cell will take a while to run (possibly a couple hours if you are not using a GPU)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "3125/3125 [==============================] - 574s 181ms/step - loss: 0.3075 - accuracy: 0.5393 - val_loss: 0.2192 - val_accuracy: 0.6319\n",
+ "Epoch 2/10\n",
+ "3125/3125 [==============================] - 564s 180ms/step - loss: 0.1916 - accuracy: 0.6689 - val_loss: 0.1880 - val_accuracy: 0.6731\n",
+ "Epoch 3/10\n",
+ "3125/3125 [==============================] - 566s 181ms/step - loss: 0.1602 - accuracy: 0.7119 - val_loss: 0.1751 - val_accuracy: 0.6916\n",
+ "Epoch 4/10\n",
+ "3125/3125 [==============================] - 566s 181ms/step - loss: 0.1395 - accuracy: 0.7415 - val_loss: 0.1715 - val_accuracy: 0.6979\n",
+ "Epoch 5/10\n",
+ "3125/3125 [==============================] - 566s 181ms/step - loss: 0.1227 - accuracy: 0.7666 - val_loss: 0.1707 - val_accuracy: 0.7025\n",
+ "Epoch 6/10\n",
+ "3125/3125 [==============================] - 567s 181ms/step - loss: 0.1085 - accuracy: 0.7887 - val_loss: 0.1730 - val_accuracy: 0.6995\n",
+ "Epoch 7/10\n",
+ "3125/3125 [==============================] - 571s 183ms/step - loss: 0.0961 - accuracy: 0.8089 - val_loss: 0.1764 - val_accuracy: 0.7000\n",
+ "Epoch 8/10\n",
+ "3125/3125 [==============================] - 567s 181ms/step - loss: 0.0852 - accuracy: 0.8273 - val_loss: 0.1821 - val_accuracy: 0.6981\n",
+ "Epoch 9/10\n",
+ "3125/3125 [==============================] - 565s 181ms/step - loss: 0.0759 - accuracy: 0.8438 - val_loss: 0.1881 - val_accuracy: 0.6956\n",
+ "Epoch 10/10\n",
+ "3125/3125 [==============================] - 565s 181ms/step - loss: 0.0682 - accuracy: 0.8577 - val_loss: 0.1951 - val_accuracy: 0.6906\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# extra code — completes the model and trains it\n",
+ "decoder = tf.keras.layers.LSTM(512, return_sequences=True)\n",
+ "decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)\n",
+ "output_layer = tf.keras.layers.Dense(vocab_size, activation=\"softmax\")\n",
+ "Y_proba = output_layer(decoder_outputs)\n",
+ "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n",
+ " outputs=[Y_proba])\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "model.fit((X_train, X_train_dec), Y_train, epochs=10,\n",
+ " validation_data=((X_valid, X_valid_dec), Y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'me gusta el fútbol'"
+ ]
+ },
+ "execution_count": 70,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "translate(\"I like soccer\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Beam Search"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This is a very basic implementation of beam search. I tried to make it readable and understandable, but it's definitely not optimized for speed! The function first uses the model to find the top _k_ words to start the translations (where _k_ is the beam width). For each of the top _k_ translations, it evaluates the conditional probabilities of all possible words it could add to that translation. These extended translations and their probabilities are added to the list of candidates. Once we've gone through all top _k_ translations and all words that could complete them, we keep only the top _k_ candidates with the highest probability, and we iterate over and over until they all finish with an EOS token. The top translation is then returned (after removing its EOS token).\n",
+ "\n",
+ "* Note: If p(S) is the probability of sentence S, and p(W|S) is the conditional probability of the word W given that the translation starts with S, then the probability of the sentence S' = concat(S, W) is p(S') = p(S) * p(W|S). As we add more words, the probability gets smaller and smaller. To avoid the risk of it getting too small, which could cause floating point precision errors, the function keeps track of log probabilities instead of probabilities: recall that log(a\\*b) = log(a) + log(b), therefore log(p(S')) = log(p(S)) + log(p(W|S))."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – a basic implementation of beam search\n",
+ "\n",
+ "def beam_search(sentence_en, beam_width, verbose=False):\n",
+ "    \"\"\"Translates an English sentence to Spanish using beam search.\n",
+ "\n",
+ "    Keeps the `beam_width` most likely partial translations, extends each\n",
+ "    unfinished one with every word of the vocabulary, and retains the\n",
+ "    `beam_width` best candidates. This is repeated until every retained\n",
+ "    translation ends with \"endofseq\" or `max_length` words were generated.\n",
+ "    Scores are sums of log probabilities.\n",
+ "\n",
+ "    Args:\n",
+ "        sentence_en: the English sentence to translate (a string).\n",
+ "        beam_width: number of candidate translations kept at each step.\n",
+ "        verbose: if True, prints the retained candidates at each step.\n",
+ "\n",
+ "    Returns:\n",
+ "        The most likely translation found, without its \"endofseq\" token.\n",
+ "    \"\"\"\n",
+ "    vocabulary = text_vec_layer_es.get_vocabulary()  # fetched only once\n",
+ "    X = np.array([sentence_en])  # encoder input (identical at every step)\n",
+ "    X_dec = np.array([\"startofseq\"])  # decoder input\n",
+ "    y_proba = model.predict((X, X_dec), verbose=0)[0, 0]  # first token's probas\n",
+ "    top_k = tf.math.top_k(y_proba, k=beam_width)\n",
+ "    top_translations = [  # list of best (log_proba, translation)\n",
+ "        (np.log(word_proba), vocabulary[word_id])\n",
+ "        for word_proba, word_id in zip(top_k.values, top_k.indices)\n",
+ "    ]\n",
+ "\n",
+ "    # extra code – displays the top first words in verbose mode\n",
+ "    if verbose:\n",
+ "        print(\"Top first words:\", top_translations)\n",
+ "\n",
+ "    for idx in range(1, max_length):\n",
+ "        candidates = []\n",
+ "        for log_proba, translation in top_translations:\n",
+ "            if translation.endswith(\"endofseq\"):\n",
+ "                candidates.append((log_proba, translation))\n",
+ "                continue  # translation is finished, so don't try to extend it\n",
+ "            X_dec = np.array([\"startofseq \" + translation])  # decoder input\n",
+ "            # the output at position idx holds the next word's probas\n",
+ "            y_proba = model.predict((X, X_dec), verbose=0)[0, idx]\n",
+ "            for word_id, word_proba in enumerate(y_proba):\n",
+ "                candidates.append((log_proba + np.log(word_proba),\n",
+ "                                   f\"{translation} {vocabulary[word_id]}\"))\n",
+ "        top_translations = sorted(candidates, reverse=True)[:beam_width]\n",
+ "\n",
+ "        # extra code – displays the top translations so far in verbose mode\n",
+ "        if verbose:\n",
+ "            print(\"Top translations so far:\", top_translations)\n",
+ "\n",
+ "        if all(tr.endswith(\"endofseq\") for _, tr in top_translations):\n",
+ "            break\n",
+ "\n",
+ "    # Also reached when max_length runs out before every candidate has\n",
+ "    # finished: the best candidate is returned rather than None.\n",
+ "    return top_translations[0][1].replace(\"endofseq\", \"\").strip()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'me [UNK] los gatos y los gatos'"
+ ]
+ },
+ "execution_count": 72,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# extra code – shows the model making an error\n",
+ "sentence_en = \"I love cats and dogs\"\n",
+ "translate(sentence_en)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top first words: [(-0.012974381, 'me'), (-4.592527, '[UNK]'), (-6.314033, 'yo')]\n",
+ "Top translations so far: [(-0.4831518, 'me [UNK]'), (-1.4920667, 'me encanta'), (-1.986235, 'me gustan')]\n",
+ "Top translations so far: [(-0.6793061, 'me [UNK] los'), (-1.9889652, 'me gustan los'), (-2.0470557, 'me encanta los')]\n",
+ "Top translations so far: [(-0.7609749, 'me [UNK] los gatos'), (-2.0677316, 'me gustan los gatos'), (-2.26029, 'me encanta los gatos')]\n",
+ "Top translations so far: [(-0.76985043, 'me [UNK] los gatos y'), (-2.0701222, 'me gustan los gatos y'), (-2.2649746, 'me encanta los gatos y')]\n",
+ "Top translations so far: [(-0.81283045, 'me [UNK] los gatos y los'), (-2.118244, 'me gustan los gatos y los'), (-2.96167, 'me encanta los gatos y los')]\n",
+ "Top translations so far: [(-1.2259341, 'me [UNK] los gatos y los gatos'), (-1.9556838, 'me [UNK] los gatos y los perros'), (-2.7524388, 'me gustan los gatos y los perros')]\n",
+ "Top translations so far: [(-1.2261332, 'me [UNK] los gatos y los gatos endofseq'), (-1.9560521, 'me [UNK] los gatos y los perros endofseq'), (-2.7566314, 'me gustan los gatos y los perros endofseq')]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'me [UNK] los gatos y los gatos'"
+ ]
+ },
+ "execution_count": 73,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# extra code – shows how beam search can help\n",
+ "beam_search(sentence_en, beam_width=3, verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The correct translation is in the top 3 sentences found by beam search, but it's not the first. Since we're using a small vocabulary, the \\[UNK] token is quite frequent, so you may want to penalize it (e.g., divide its probability by 2 in the beam search function): this will discourage beam search from using it too much."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Attention Mechanisms"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We need to feed all the encoder's outputs to the `Attention` layer, so we must add `return_sequences=True` to the encoder:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n",
+ "encoder = tf.keras.layers.Bidirectional(\n",
+ " tf.keras.layers.LSTM(256, return_sequences=True, return_state=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – this part of the model is exactly the same as earlier\n",
+ "encoder_outputs, *encoder_state = encoder(encoder_embeddings)\n",
+ "encoder_state = [tf.concat(encoder_state[::2], axis=-1), # short-term (0 & 2)\n",
+ " tf.concat(encoder_state[1::2], axis=-1)] # long-term (1 & 3)\n",
+ "decoder = tf.keras.layers.LSTM(512, return_sequences=True)\n",
+ "decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And finally, let's add the `Attention` layer and the output layer:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "attention_layer = tf.keras.layers.Attention()\n",
+ "attention_outputs = attention_layer([decoder_outputs, encoder_outputs])\n",
+ "output_layer = tf.keras.layers.Dense(vocab_size, activation=\"softmax\")\n",
+ "Y_proba = output_layer(attention_outputs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following cell will take a while to run (possibly a couple hours if you are not using a GPU)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "3125/3125 [==============================] - 597s 189ms/step - loss: 0.3074 - accuracy: 0.5469 - val_loss: 0.2106 - val_accuracy: 0.6487\n",
+ "Epoch 2/10\n",
+ "3125/3125 [==============================] - 585s 187ms/step - loss: 0.1902 - accuracy: 0.6789 - val_loss: 0.1865 - val_accuracy: 0.6830\n",
+ "Epoch 3/10\n",
+ "3125/3125 [==============================] - 585s 187ms/step - loss: 0.1659 - accuracy: 0.7123 - val_loss: 0.1759 - val_accuracy: 0.7005\n",
+ "Epoch 4/10\n",
+ "3125/3125 [==============================] - 584s 187ms/step - loss: 0.1493 - accuracy: 0.7359 - val_loss: 0.1728 - val_accuracy: 0.7060\n",
+ "Epoch 5/10\n",
+ "3125/3125 [==============================] - 582s 186ms/step - loss: 0.1358 - accuracy: 0.7548 - val_loss: 0.1724 - val_accuracy: 0.7084\n",
+ "Epoch 6/10\n",
+ "3125/3125 [==============================] - 583s 186ms/step - loss: 0.1245 - accuracy: 0.7712 - val_loss: 0.1738 - val_accuracy: 0.7103\n",
+ "Epoch 7/10\n",
+ "3125/3125 [==============================] - 582s 186ms/step - loss: 0.1148 - accuracy: 0.7863 - val_loss: 0.1770 - val_accuracy: 0.7111\n",
+ "Epoch 8/10\n",
+ "3125/3125 [==============================] - 582s 186ms/step - loss: 0.1064 - accuracy: 0.7992 - val_loss: 0.1806 - val_accuracy: 0.7110\n",
+ "Epoch 9/10\n",
+ "3125/3125 [==============================] - 582s 186ms/step - loss: 0.0991 - accuracy: 0.8101 - val_loss: 0.1862 - val_accuracy: 0.7088\n",
+ "Epoch 10/10\n",
+ "3125/3125 [==============================] - 581s 186ms/step - loss: 0.0929 - accuracy: 0.8205 - val_loss: 0.1903 - val_accuracy: 0.7077\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n",
+ " outputs=[Y_proba])\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n",
+ " metrics=[\"accuracy\"])\n",
+ "model.fit((X_train, X_train_dec), Y_train, epochs=10,\n",
+ " validation_data=((X_valid, X_valid_dec), Y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'me gusta el fútbol y también ir a la playa'"
+ ]
+ },
+ "execution_count": 78,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "translate(\"I like soccer and also going to the beach\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Top first words: [(-0.26210824, 'me'), (-2.553061, 'prefiero'), (-3.2005944, 'yo')]\n",
+ "Top translations so far: [(-0.32478744, 'me gusta'), (-3.0608056, 'prefiero el'), (-3.1685317, 'me gustan')]\n",
+ "Top translations so far: [(-0.7464272, 'me gusta el'), (-2.4712462, 'me gusta fútbol'), (-2.9149299, 'me gusta al')]\n",
+ "Top translations so far: [(-1.0369574, 'me gusta el fútbol'), (-2.3301778, 'me gusta el el'), (-2.9658434, 'me gusta fútbol y')]\n",
+ "Top translations so far: [(-1.0404125, 'me gusta el fútbol y'), (-2.5983238, 'me gusta el el fútbol'), (-2.9736564, 'me gusta fútbol y también')]\n",
+ "Top translations so far: [(-1.0520902, 'me gusta el fútbol y también'), (-2.6003318, 'me gusta el el fútbol y'), (-3.128903, 'me gusta fútbol y también me')]\n",
+ "Top translations so far: [(-1.9568634, 'me gusta el fútbol y también ir'), (-2.6169589, 'me gusta el el fútbol y también'), (-2.6949644, 'me gusta el fútbol y también fuera')]\n",
+ "Top translations so far: [(-1.9676423, 'me gusta el fútbol y también ir a'), (-2.8482866, 'me gusta el fútbol y también fuera a'), (-3.7197533, 'me gusta el el fútbol y también ir')]\n",
+ "Top translations so far: [(-1.9692448, 'me gusta el fútbol y también ir a la'), (-2.8501132, 'me gusta el fútbol y también fuera a la'), (-3.7309551, 'me gusta el el fútbol y también ir a')]\n",
+ "Top translations so far: [(-1.9733216, 'me gusta el fútbol y también ir a la playa'), (-2.851697, 'me gusta el fútbol y también fuera a la playa'), (-3.7333717, 'me gusta el el fútbol y también ir a la')]\n",
+ "Top translations so far: [(-1.9737166, 'me gusta el fútbol y también ir a la playa endofseq'), (-2.8547554, 'me gusta el fútbol y también fuera a la playa endofseq'), (-3.737218, 'me gusta el el fútbol y también ir a la playa')]\n",
+ "Top translations so far: [(-1.9737166, 'me gusta el fútbol y también ir a la playa endofseq'), (-2.8547554, 'me gusta el fútbol y también fuera a la playa endofseq'), (-3.7375438, 'me gusta el el fútbol y también ir a la playa endofseq')]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'me gusta el fútbol y también ir a la playa'"
+ ]
+ },
+ "execution_count": 79,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "beam_search(\"I like soccer and also going to the beach\", beam_width=3,\n",
+ " verbose=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Attention Is All You Need: The Transformer Architecture\n",
+ "### Positional encodings"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "max_length = 50 # max length in the whole training set\n",
+ "embed_size = 128\n",
+ "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n",
+ "pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)\n",
+ "batch_max_len_enc = tf.shape(encoder_embeddings)[1]\n",
+ "encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))\n",
+ "batch_max_len_dec = tf.shape(decoder_embeddings)[1]\n",
+ "decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Alternatively, we can use fixed, non-trainable positional encodings:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"class PositionalEncoding(tf.keras.layers.Layer):\n",
- "    def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):\n",
+ "    def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):\n",
+ "        \"\"\"Precomputes fixed (non-trainable) sine/cosine positional encodings.\n",
+ "\n",
+ "        Args:\n",
+ "            max_length: number of positions to precompute encodings for.\n",
+ "            embed_size: size of the inputs' last dimension (odd or even).\n",
+ "            dtype: dtype of the encodings.\n",
+ "            **kwargs: forwarded to the base layer's constructor.\n",
+ "        \"\"\"\n",
"        super().__init__(dtype=dtype, **kwargs)\n",
- "        if max_dims % 2 == 1: max_dims += 1  # max_dims must be even\n",
- "        p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))\n",
- "        pos_emb = np.empty((1, max_steps, max_dims))\n",
- "        pos_emb[0, :, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T\n",
- "        pos_emb[0, :, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T\n",
- "        self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))\n",
+ "        max_dims = (embed_size + 1) // 2 * 2  # round up to nearest even number\n",
+ "        p, i = np.meshgrid(np.arange(max_length), 2 * np.arange(max_dims // 2))\n",
+ "        pos_emb = np.empty((1, max_length, max_dims))\n",
+ "        pos_emb[0, :, ::2] = np.sin(p / 10_000 ** (i / max_dims)).T  # even dims\n",
+ "        pos_emb[0, :, 1::2] = np.cos(p / 10_000 ** (i / max_dims)).T  # odd dims\n",
+ "        # Drop the extra column created by the rounding when embed_size is odd,\n",
+ "        # so the encodings always match the inputs' last dimension in call().\n",
+ "        pos_emb = pos_emb[:, :, :embed_size]\n",
+ "        self.pos_encodings = tf.constant(pos_emb.astype(self.dtype))\n",
+ "        self.supports_masking = True\n",
+ "\n",
"    def call(self, inputs):\n",
- "        shape = tf.shape(inputs)\n",
- "        return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]"
+ "        \"\"\"Returns the inputs plus the encodings of their positions.\"\"\"\n",
+ "        # keep only the encodings for the positions present in this batch\n",
+ "        batch_max_length = tf.shape(inputs)[1]\n",
+ "        return inputs + self.pos_encodings[:, :batch_max_length]"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
- "max_steps = 201\n",
- "max_dims = 512\n",
- "pos_emb = PositionalEncoding(max_steps, max_dims)\n",
- "PE = pos_emb(np.zeros((1, max_steps, max_dims), np.float32))[0].numpy()"
+ "pos_embed_layer = PositionalEncoding(max_length, embed_size)\n",
+ "encoder_in = pos_embed_layer(encoder_embeddings)\n",
+ "decoder_in = pos_embed_layer(decoder_embeddings)"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 83,
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "