diff --git a/16_nlp_with_rnns_and_attention.ipynb b/16_nlp_with_rnns_and_attention.ipynb index 901a069..9bbef1f 100644 --- a/16_nlp_with_rnns_and_attention.ipynb +++ b/16_nlp_with_rnns_and_attention.ipynb @@ -28,16 +28,6 @@ "" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# WORK IN PROGRESS\n", - "\n", - "\n", - "**I'm still working on updating this chapter to the 3rd edition. Please come back in a few weeks.**" - ] - }, { "cell_type": "markdown", "metadata": { @@ -59,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "TFSU3FCOpKzu" }, @@ -81,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "YqCwW7cMpKzw" }, @@ -103,7 +93,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "0Piq5se2pKzx" }, @@ -125,7 +115,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "id": "8d4TH3NbpKzx" }, @@ -151,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "id": "PQFH5Y9PpKzy" }, @@ -180,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "Ekxzo6pOpKzy" }, @@ -199,86 +189,42 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebooks uses the TensorFlow Addons library, and the Transformers library. 
If you're running on Colab, then we need to install them now:" + "# Generating Shakespearean Text Using a Character RNN" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating the Training Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's download the Shakespeare data from Andrej Karpathy's [char-rnn project](https://github.com/karpathy/char-rnn/)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading data from https://homl.info/shakespeare\n", + "1122304/1115394 [==============================] - 0s 0us/step\n", + "1130496/1115394 [==============================] - 0s 0us/step\n" + ] + } + ], "source": [ - "if \"google.colab\" in sys.modules:\n", - " %pip install -q -U tensorflow-addons\n", - " %pip install -q -U transformers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Char-RNN" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Splitting a sequence into batches of shuffled windows" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For example, let's split the sequence 0 to 14 into windows of length 5, each shifted by 2 (e.g.,`[0, 1, 2, 3, 4]`, `[2, 3, 4, 5, 6]`, etc.), then shuffle them, and split them into inputs (the first 4 steps) and targets (the last 4 steps) (e.g., `[2, 3, 4, 5, 6]` would be split into `[[2, 3, 4, 5], [3, 4, 5, 6]]`), then create batches of 3 such input/target pairs:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", + "import tensorflow as tf\n", "\n", - "n_steps = 5\n", - "dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))\n", - "dataset = dataset.window(n_steps, shift=2, drop_remainder=True)\n", - "dataset = dataset.flat_map(lambda window: window.batch(n_steps))\n", - "dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))\n", - "dataset = dataset.batch(3).prefetch(1)\n", - "for index, (X_batch, Y_batch) in enumerate(dataset):\n", - " print(\"_\" * 20, \"Batch\", index, \"\\nX_batch\")\n", - " print(X_batch.numpy())\n", - " print(\"=\" * 5, \"\\nY_batch\")\n", - " print(Y_batch.numpy())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading the Data and Preparing the Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "shakespeare_url = \"https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt\"\n", + "shakespeare_url = \"https://homl.info/shakespeare\" # shortcut URL\n", "filepath = tf.keras.utils.get_file(\"shakespeare.txt\", shakespeare_url)\n", "with open(filepath) as 
f:\n", " shakespeare_text = f.read()" @@ -286,269 +232,440 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "First Citizen:\n", + "Before we proceed any further, hear me speak.\n", + "\n", + "All:\n", + "Speak, speak.\n" + ] + } + ], "source": [ - "print(shakespeare_text[:148])" + "# extra code – shows a short text sample\n", + "print(shakespeare_text[:80])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "\"\\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz\"" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "# extra code – shows all 39 distinct characters (after converting to lower case)\n", "\"\".join(sorted(set(shakespeare_text.lower())))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)\n", - "tokenizer.fit_on_texts(shakespeare_text)" + "text_vec_layer = tf.keras.layers.TextVectorization(split=\"character\",\n", + " standardize=\"lower\")\n", + "text_vec_layer.adapt([shakespeare_text])\n", + "encoded = text_vec_layer([shakespeare_text])[0]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "tokenizer.texts_to_sequences([\"First\"])" + "encoded -= 2 # drop tokens 0 (pad) and 1 (unknown), which we will not use\n", + "n_tokens = text_vec_layer.vocabulary_size() - 2 # number of distinct chars = 39\n", + "dataset_size = len(encoded) # total number of chars = 1,115,394" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "39" + ] + }, + 
"execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])" + "n_tokens" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "1115394" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "max_id = len(tokenizer.word_index) # number of distinct characters\n", - "dataset_size = tokenizer.document_count # total number of characters" + "dataset_size" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1\n", - "train_size = dataset_size * 90 // 100\n", - "dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: in previous versions of this code, we used `dataset.repeat()` now to make the dataset \"infinite\", and later in the notebook we set the `steps_per_epoch` argument when calling the `model.fit()` method. This was needed to work around some TensorFlow bugs. However, since these bugs have now been fixed, we can simplify the code: no need for `dataset.repeat()` or `steps_per_epoch` anymore." 
+ "def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):\n", + " ds = tf.data.Dataset.from_tensor_slices(sequence)\n", + " ds = ds.window(length + 1, shift=1, drop_remainder=True)\n", + " ds = ds.flat_map(lambda window_ds: window_ds.batch(length + 1))\n", + " if shuffle:\n", + " ds = ds.shuffle(100_000, seed=seed)\n", + " ds = ds.batch(batch_size)\n", + " return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[(,\n", + " )]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "n_steps = 100\n", - "window_length = n_steps + 1 # target = input shifted 1 character ahead\n", - "dataset = dataset.window(window_length, shift=1, drop_remainder=True)" + "# extra code – a simple example using to_dataset()\n", + "# There's just one sample in this dataset: the input represents \"to b\" and the\n", + "# output represents \"o be\"\n", + "list(to_dataset(text_vec_layer([\"To be\"])[0], length=4))" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = dataset.flat_map(lambda window: window.batch(window_length))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_size = 32\n", - "dataset = dataset.shuffle(10000).batch(batch_size)\n", - "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = dataset.map(\n", - " lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "dataset = dataset.prefetch(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for X_batch, Y_batch in dataset.take(1):\n", - " print(X_batch.shape, Y_batch.shape)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Creating and Training the Model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: the following code may take up to 24 hours to run, depending on your hardware. If you use a GPU, it may take just 1 or 2 hours, or less." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: the `GRU` class will only use the GPU (if you have one) when using the default values for the following arguments: `activation`, `recurrent_activation`, `recurrent_dropout`, `unroll`, `use_bias` and `reset_after`. This is why I commented out `recurrent_dropout=0.2` (compared to the book)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model = tf.keras.Sequential([\n", - " tf.keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id],\n", - " #dropout=0.2, recurrent_dropout=0.2),\n", - " dropout=0.2),\n", - " tf.keras.layers.GRU(128, return_sequences=True,\n", - " #dropout=0.2, recurrent_dropout=0.2),\n", - " dropout=0.2),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,\n", - " activation=\"softmax\"))\n", - "])\n", - "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n", - "history = model.fit(dataset, epochs=10)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using the Model to Generate Text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess(texts):\n", - " X = np.array(tokenizer.texts_to_sequences(texts)) - 1\n", - " return tf.one_hot(X, max_id)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: the `predict_classes()` method is deprecated. Instead, we must use `model(X_new).argmax(axis=-1)`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_new = preprocess([\"How are yo\"])\n", - "#Y_pred = model.predict_classes(X_new)\n", - "Y_pred = model(X_new).argmax(axis=-1)\n", - "tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # 1st sentence, last char" - ] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ + "length = 100\n", "tf.random.set_seed(42)\n", - "\n", - "tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], num_samples=40).numpy()" + "train_set = to_dataset(encoded[:1_000_000], length=length, shuffle=True,\n", + " seed=42)\n", + "valid_set = to_dataset(encoded[1_000_000:1_060_000], length=length)\n", + "test_set = to_dataset(encoded[1_060_000:], length=length)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Building and Training the Char-RNN Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the following code may one or two hours to run, depending on your GPU. Without a GPU, it may take over 24 hours. If you don't want to wait, just skip the next two code cells and run the code below to download a pretrained model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: the `GRU` class will only use cuDNN acceleration (assuming you have a GPU) when using the default values for the following arguments: `activation`, `recurrent_activation`, `recurrent_dropout`, `unroll`, `use_bias` and `reset_after`." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1407s 45ms/step - loss: 1.3873 - accuracy: 0.5754 - val_loss: 1.6155 - val_accuracy: 0.5333\n", + "Epoch 2/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1376s 44ms/step - loss: 1.2921 - accuracy: 0.5973 - val_loss: 1.5881 - val_accuracy: 0.5401\n", + "Epoch 3/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1379s 44ms/step - loss: 1.2743 - accuracy: 0.6015 - val_loss: 1.5885 - val_accuracy: 0.5407\n", + "Epoch 4/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1381s 44ms/step - loss: 1.2654 - accuracy: 0.6031 - val_loss: 1.5701 - val_accuracy: 0.5418\n", + "Epoch 5/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1379s 44ms/step - loss: 1.2594 - 
accuracy: 0.6045 - val_loss: 1.5674 - val_accuracy: 0.5450\n", + "Epoch 6/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1386s 44ms/step - loss: 1.2545 - accuracy: 0.6058 - val_loss: 1.5587 - val_accuracy: 0.5492\n", + "Epoch 7/10\n", + "31247/31247 [==============================] - 1381s 44ms/step - loss: 1.2514 - accuracy: 0.6062 - val_loss: 1.5532 - val_accuracy: 0.5460\n", + "Epoch 8/10\n", + "31247/31247 [==============================] - 1381s 44ms/step - loss: 1.2485 - accuracy: 0.6067 - val_loss: 1.5522 - val_accuracy: 0.5479\n", + "Epoch 9/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1382s 44ms/step - loss: 1.2460 - accuracy: 0.6073 - val_loss: 1.5521 - val_accuracy: 0.5497\n", + "Epoch 10/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "31247/31247 [==============================] - 1385s 44ms/step - loss: 1.2436 - accuracy: 0.6080 - val_loss: 1.5477 - val_accuracy: 0.5513\n" + ] + } + ], + "source": [ + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n", + "model = tf.keras.Sequential([\n", + " tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),\n", + " tf.keras.layers.GRU(128, return_sequences=True),\n", + " tf.keras.layers.Dense(n_tokens, activation=\"softmax\")\n", + "])\n", + "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "model_ckpt = 
tf.keras.callbacks.ModelCheckpoint(\n", + " \"my_shakespeare_model\", monitor=\"val_accuracy\", save_best_only=True)\n", + "history = model.fit(train_set, validation_data=valid_set, epochs=10,\n", + " callbacks=[model_ckpt])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "shakespeare_model = tf.keras.Sequential([\n", + " text_vec_layer,\n", + " tf.keras.layers.Lambda(lambda X: X - 2), # no padding or unknown tokens\n", + " model\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you don't want to wait for training to complete, I've pretrained a model for you. The following code will download it. Uncomment the last line if you want to use it instead of the model trained above." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – downloads a pretrained model\n", + "url = \"https://github.com/ageron/data/raw/main/shakespeare_model.tgz\"\n", + "path = tf.keras.utils.get_file(\"shakespeare_model.tgz\", url, extract=True)\n", + "model_path = Path(path).with_name(\"shakespeare_model\")\n", + "#shakespeare_model = tf.keras.models.load_model(model_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'e'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_proba = shakespeare_model.predict([\"To be or not to b\"])[0, -1]\n", + "y_pred = tf.argmax(y_proba) # choose the most probable character ID\n", + "text_vec_layer.get_vocabulary()[y_pred + 2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generating Fake Shakespearean Text" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "log_probas = tf.math.log([[0.5, 0.4, 0.1]]) # probas = 50%, 40%, and 10%\n", + "tf.random.set_seed(42)\n", + "tf.random.categorical(log_probas, num_samples=8) # draw 8 samples" + ] + }, + { + "cell_type": "code", + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def next_char(text, temperature=1):\n", - " X_new = preprocess([text])\n", - " y_proba = model(X_new)[0, -1:, :]\n", + " y_proba = shakespeare_model.predict([text])[0, -1:]\n", " rescaled_logits = tf.math.log(y_proba) / temperature\n", - " char_id = tf.random.categorical(rescaled_logits, num_samples=1) + 1\n", - " return tokenizer.sequences_to_texts(char_id.numpy())[0]" + " char_id = tf.random.categorical(rescaled_logits, num_samples=1)[0, 0]\n", + " return text_vec_layer.get_vocabulary()[char_id + 2]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ - "tf.random.set_seed(42)\n", - "\n", - "next_char(\"How are yo\", temperature=1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def complete_text(text, n_chars=50, temperature=1):\n", + "def extend_text(text, n_chars=50, temperature=1):\n", " for _ in range(n_chars):\n", " text += next_char(text, temperature)\n", " return text" @@ -556,31 +673,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "tf.random.set_seed(42)\n", - "\n", - "print(complete_text(\"t\", temperature=0.2))" + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To be or not to be the duke\n", + "as it is a proper strange death,\n", + "and the\n" + ] + } + ], "source": [ - "print(complete_text(\"t\", 
temperature=1))" + "print(extend_text(\"To be or not to be\", temperature=0.01))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To be or not to behold?\n", + "\n", + "second push:\n", + "gremio, lord all, a sistermen,\n" + ] + } + ], "source": [ - "print(complete_text(\"t\", temperature=2))" + "print(extend_text(\"To be or not to be\", temperature=1))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "To be or not to bef ,mt'&o3fpadm!$\n", + "wh!nse?bws3est--vgerdjw?c-y-ewznq\n" + ] + } + ], + "source": [ + "print(extend_text(\"To be or not to be\", temperature=100))" ] }, { @@ -592,79 +746,124 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ - "tf.random.set_seed(42)" + "def to_dataset_for_stateful_rnn(sequence, length):\n", + " ds = tf.data.Dataset.from_tensor_slices(sequence)\n", + " ds = ds.window(length + 1, shift=length, drop_remainder=True)\n", + " ds = ds.flat_map(lambda window: window.batch(length + 1)).batch(1)\n", + " return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)\n", + "\n", + "stateful_train_set = to_dataset_for_stateful_rnn(encoded[:1_000_000], length)\n", + "stateful_valid_set = to_dataset_for_stateful_rnn(encoded[1_000_000:1_060_000], length)\n", + "stateful_test_set = to_dataset_for_stateful_rnn(encoded[1_060_000:], length)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[(,\n", + " ),\n", + " (,\n", + " ),\n", + " (,\n", + " )]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "dataset = 
tf.data.Dataset.from_tensor_slices(encoded[:train_size])\n", - "dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n", - "dataset = dataset.flat_map(lambda window: window.batch(window_length))\n", - "dataset = dataset.batch(1)\n", - "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n", - "dataset = dataset.map(\n", - " lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n", - "dataset = dataset.prefetch(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "batch_size = 32\n", - "encoded_parts = np.array_split(encoded[:train_size], batch_size)\n", - "datasets = []\n", - "for encoded_part in encoded_parts:\n", - " dataset = tf.data.Dataset.from_tensor_slices(encoded_part)\n", - " dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True)\n", - " dataset = dataset.flat_map(lambda window: window.batch(window_length))\n", - " datasets.append(dataset)\n", - "dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))\n", - "dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))\n", - "dataset = dataset.map(\n", - " lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))\n", - "dataset = dataset.prefetch(1)" + "# extra code – simple example using to_dataset_for_stateful_rnn()\n", + "list(to_dataset_for_stateful_rnn(tf.range(10), 3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Note**: once again, I commented out `recurrent_dropout=0.2` (compared to the book) so you can get GPU acceleration (if you have one)." 
+ "If you'd like to have more than one window per batch, you can use the `to_batched_dataset_for_stateful_rnn()` function instead of `to_dataset_for_stateful_rnn()`:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[(,\n", + " ),\n", + " (,\n", + " ),\n", + " (,\n", + " )]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# extra code – shows one way to prepare a batched dataset for a stateful RNN\n", + "\n", + "import numpy as np\n", + "\n", + "def to_non_overlapping_windows(sequence, length):\n", + " ds = tf.data.Dataset.from_tensor_slices(sequence)\n", + " ds = ds.window(length + 1, shift=length, drop_remainder=True)\n", + " return ds.flat_map(lambda window: window.batch(length + 1))\n", + "\n", + "def to_batched_dataset_for_stateful_rnn(sequence, length, batch_size=32):\n", + " parts = np.array_split(sequence, batch_size)\n", + " datasets = tuple(to_non_overlapping_windows(part, length) for part in parts)\n", + " ds = tf.data.Dataset.zip(datasets).map(lambda *windows: tf.stack(windows))\n", + " return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)\n", + "\n", + "list(to_batched_dataset_for_stateful_rnn(tf.range(20), length=3, batch_size=2))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n", "model = tf.keras.Sequential([\n", - " tf.keras.layers.GRU(128, return_sequences=True, stateful=True,\n", - " #dropout=0.2, recurrent_dropout=0.2,\n", - " dropout=0.2,\n", - " batch_input_shape=[batch_size, None, max_id]),\n", - " tf.keras.layers.GRU(128, return_sequences=True, stateful=True,\n", - " #dropout=0.2, recurrent_dropout=0.2),\n", - " dropout=0.2),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,\n", - " activation=\"softmax\"))\n", + 
" tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16,\n", + " batch_input_shape=[1, None]),\n", + " tf.keras.layers.GRU(128, return_sequences=True, stateful=True),\n", + " tf.keras.layers.Dense(n_tokens, activation=\"softmax\")\n", "])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -675,33 +874,197 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")\n", - "history = model.fit(dataset, epochs=50,\n", - " callbacks=[ResetStatesCallback()])" + "# extra code – use a different directory to save the checkpoints\n", + "model_ckpt = tf.keras.callbacks.ModelCheckpoint(\n", + " \"my_stateful_shakespeare_model\",\n", + " monitor=\"val_accuracy\",\n", + " save_best_only=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "To use the model with different batch sizes, we need to create a stateless copy. We can get rid of dropout since it is only used during training:" + "**Warning**: the following cell will take a while to run (possibly an hour if you are not using a GPU)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 213s 21ms/step - loss: 1.8690 - accuracy: 0.4494 - val_loss: 1.7632 - val_accuracy: 0.4672\n", + "Epoch 2/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 211s 21ms/step - loss: 1.5635 - accuracy: 0.5284 - val_loss: 1.6334 - val_accuracy: 0.4994\n", + "Epoch 3/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 209s 21ms/step - loss: 1.4875 - accuracy: 0.5478 - val_loss: 1.5788 - val_accuracy: 0.5153\n", + "Epoch 4/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 208s 21ms/step - loss: 1.4483 - accuracy: 0.5579 - val_loss: 1.5471 - val_accuracy: 0.5236\n", + "Epoch 5/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 213s 21ms/step - loss: 1.4241 - accuracy: 0.5643 - val_loss: 1.5270 - val_accuracy: 0.5286\n", + 
"Epoch 6/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 215s 21ms/step - loss: 1.4074 - accuracy: 0.5686 - val_loss: 1.5109 - val_accuracy: 0.5338\n", + "Epoch 7/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 210s 21ms/step - loss: 1.3953 - accuracy: 0.5714 - val_loss: 1.5008 - val_accuracy: 0.5361\n", + "Epoch 8/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 212s 21ms/step - loss: 1.3863 - accuracy: 0.5737 - val_loss: 1.4938 - val_accuracy: 0.5381\n", + "Epoch 9/10\n", + "9999/9999 [==============================] - 207s 21ms/step - loss: 1.3790 - accuracy: 0.5757 - val_loss: 1.4890 - val_accuracy: 0.5380\n", + "Epoch 10/10\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Assets written to: my_stateful_shakespeare_model/assets\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "9999/9999 [==============================] - 208s 21ms/step - loss: 1.3729 - accuracy: 0.5770 - val_loss: 1.4786 - val_accuracy: 0.5420\n" + ] + } + ], + "source": [ + "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "history = model.fit(stateful_train_set, validation_data=stateful_valid_set,\n", + " epochs=10, callbacks=[ResetStatesCallback(), model_ckpt])" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "**Extra Material: converting the stateful RNN to a stateless RNN and using it**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the model with different batch sizes, we need to create a stateless copy:" + ] + }, + { + "cell_type": "code", + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "stateless_model = tf.keras.Sequential([\n", - " tf.keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),\n", + " tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),\n", " tf.keras.layers.GRU(128, return_sequences=True),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(max_id,\n", - " activation=\"softmax\"))\n", + " tf.keras.layers.Dense(n_tokens, activation=\"softmax\")\n", "])" ] }, @@ -714,32 +1077,53 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ - "stateless_model.build(tf.TensorShape([None, None, max_id]))" + "stateless_model.build(tf.TensorShape([None, None]))" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "stateless_model.set_weights(model.get_weights())\n", - "model = stateless_model" + "stateless_model.set_weights(model.get_weights())" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], + "source": [ + "shakespeare_model = tf.keras.Sequential([\n", + " text_vec_layer,\n", + " tf.keras.layers.Lambda(lambda X: X - 2), # no or tokens\n", + " stateless_model\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "to be or not to be so in the world and the strangeness\n", + "to see the wo\n" + ] + } + ], "source": [ "tf.random.set_seed(42)\n", "\n", - "print(complete_text(\"t\"))" + 
"print(extend_text(\"to be or not to be\", temperature=0.01))" ] }, { @@ -751,243 +1135,286 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1mDownloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /home/ageron/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...\u001b[0m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "055c0f544ac349d9a14da8f843651df0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dl Completed...: 0 url [00:00, ? url/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e2abc244f4844d56919979b33cc2fa79", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dl Size...: 0 MiB [00:00, ? MiB/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "af507eed124c4ff6900538205b1b00fd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Generating splits...: 0%| | 0/3 [00:00
But come on Hollywood - a Moun ...\n", + "Label: 0\n", + "This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful perf ...\n", + "Label: 1\n" + ] + } + ], + "source": [ + "for review, label in raw_train_set.take(4):\n", + " print(review.numpy().decode(\"utf-8\")[:200], \"...\")\n", + " print(\"Label:\", label.numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ - "tf.random.set_seed(42)" + "vocab_size = 1000\n", + "text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)\n", + "text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can load the IMDB dataset easily:" + "**Warning**: the following cell will take a few minutes to run and the model will probably not learn anything because we didn't mask the padding tokens (that's the point of the next section)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, - "outputs": [], - "source": [ - "(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train[0][:10]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "word_index = tf.keras.datasets.imdb.get_word_index()\n", - "id_to_word = {id_ + 3: word for word, id_ in word_index.items()}\n", - "for id_, token in enumerate((\"\", \"\", \"\")):\n", - " id_to_word[id_] = token\n", - "\" \".join([id_to_word[id_] for id_ in X_train[0][:10]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow_datasets as tfds\n", - "\n", - "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "datasets.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_size = info.splits[\"train\"].num_examples\n", - "test_size = info.splits[\"test\"].num_examples" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_size, test_size" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for X_batch, y_batch in datasets[\"train\"].batch(2).take(1):\n", - " for review, label in zip(X_batch.numpy(), y_batch.numpy()):\n", - " print(\"Review:\", review.decode(\"utf-8\")[:200], \"...\")\n", - " print(\"Label:\", label, \"= Positive\" if label else \"= Negative\")\n", - " print()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def 
preprocess(X_batch, y_batch):\n", - " X_batch = tf.strings.substr(X_batch, 0, 300)\n", - " X_batch = tf.strings.regex_replace(X_batch, rb\"\", b\" \")\n", - " X_batch = tf.strings.regex_replace(X_batch, b\"[^a-zA-Z']\", b\" \")\n", - " X_batch = tf.strings.split(X_batch)\n", - " return X_batch.to_tensor(default_value=b\"\"), y_batch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "preprocess(X_batch, y_batch)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import Counter\n", - "\n", - "vocabulary = Counter()\n", - "for X_batch, y_batch in datasets[\"train\"].batch(32).map(preprocess):\n", - " for review in X_batch:\n", - " vocabulary.update(list(review.numpy()))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vocabulary.most_common()[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vocab_size = 10000\n", - "truncated_vocabulary = [\n", - " word for word, count in vocabulary.most_common()[:vocab_size]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}\n", - "for word in b\"This movie was faaaaaantastic\".split():\n", - " print(word_to_id.get(word) or vocab_size)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "words = tf.constant(truncated_vocabulary)\n", - "word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)\n", - "vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)\n", - "num_oov_buckets = 1000\n", - "table = 
tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "table.lookup(tf.constant([b\"This movie was faaaaaantastic\".split()]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def encode_words(X_batch, y_batch):\n", - " return table.lookup(X_batch), y_batch\n", - "\n", - "train_set = datasets[\"train\"].batch(32).map(preprocess)\n", - "train_set = train_set.map(encode_words).prefetch(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "for X_batch, y_batch in train_set.take(1):\n", - " print(X_batch)\n", - " print(y_batch)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/2\n", + "704/704 [==============================] - 255s 359ms/step - loss: 0.6934 - accuracy: 0.4990 - val_loss: 0.6931 - val_accuracy: 0.5016\n", + "Epoch 2/2\n", + "704/704 [==============================] - 250s 355ms/step - loss: 0.6934 - accuracy: 0.5042 - val_loss: 0.6942 - val_accuracy: 0.5008\n" + ] + } + ], "source": [ "embed_size = 128\n", + "tf.random.set_seed(42)\n", "model = tf.keras.Sequential([\n", - " tf.keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size,\n", - " mask_zero=True, # not shown in the book\n", - " input_shape=[None]),\n", - " tf.keras.layers.GRU(128, return_sequences=True),\n", + " text_vec_layer,\n", + " tf.keras.layers.Embedding(vocab_size, embed_size),\n", " tf.keras.layers.GRU(128),\n", " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n", "])\n", - "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n", - "history = model.fit(train_set, epochs=5)" + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", + " 
metrics=[\"accuracy\"])\n", + "history = model.fit(train_set, validation_data=valid_set, epochs=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Masking" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the following cell will take a while to run (possibly 30 minutes if you are not using a GPU)." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "704/704 [==============================] - 303s 426ms/step - loss: 0.5296 - accuracy: 0.7234 - val_loss: 0.4045 - val_accuracy: 0.8244\n", + "Epoch 2/5\n", + "704/704 [==============================] - 295s 419ms/step - loss: 0.3702 - accuracy: 0.8418 - val_loss: 0.3390 - val_accuracy: 0.8532\n", + "Epoch 3/5\n", + "704/704 [==============================] - 298s 423ms/step - loss: 0.3057 - accuracy: 0.8747 - val_loss: 0.3196 - val_accuracy: 0.8696\n", + "Epoch 4/5\n", + "704/704 [==============================] - 294s 418ms/step - loss: 0.2784 - accuracy: 0.8871 - val_loss: 0.3162 - val_accuracy: 0.8596\n", + "Epoch 5/5\n", + "704/704 [==============================] - 293s 417ms/step - loss: 0.2597 - accuracy: 0.8961 - val_loss: 0.3209 - val_accuracy: 0.8548\n" + ] + } + ], + "source": [ + "embed_size = 128\n", + "tf.random.set_seed(42)\n", + "model = tf.keras.Sequential([\n", + " text_vec_layer,\n", + " tf.keras.layers.Embedding(vocab_size, embed_size, mask_zero=True),\n", + " tf.keras.layers.GRU(128),\n", + " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n", + "])\n", + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "history = model.fit(train_set, validation_data=valid_set, epochs=5)" ] }, { @@ -999,405 +1426,1497 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "K = tf.keras.backend\n", - 
"embed_size = 128\n", - "inputs = tf.keras.layers.Input(shape=[None])\n", - "mask = tf.keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)\n", - "z = tf.keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)\n", - "z = tf.keras.layers.GRU(128, return_sequences=True)(z, mask=mask)\n", - "z = tf.keras.layers.GRU(128)(z, mask=mask)\n", - "outputs = tf.keras.layers.Dense(1, activation=\"sigmoid\")(z)\n", - "model = tf.keras.Model(inputs=[inputs], outputs=[outputs])\n", - "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\", metrics=[\"accuracy\"])\n", - "history = model.fit(train_set, epochs=5)" + "tf.random.set_seed(42) # extra code – ensures reproducibility on the CPU\n", + "inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)\n", + "token_ids = text_vec_layer(inputs)\n", + "mask = tf.math.not_equal(token_ids, 0)\n", + "Z = tf.keras.layers.Embedding(vocab_size, embed_size)(token_ids)\n", + "Z = tf.keras.layers.GRU(128, dropout=0.2)(Z, mask=mask)\n", + "outputs = tf.keras.layers.Dense(1, activation=\"sigmoid\")(Z)\n", + "model = tf.keras.Model(inputs=[inputs], outputs=[outputs])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Reusing Pretrained Embeddings" + "**Warning**: the following cell will take a while to run (possibly 30 minutes if you are not using a GPU)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "704/704 [==============================] - 303s 427ms/step - loss: 0.5447 - accuracy: 0.7198 - val_loss: 0.4604 - val_accuracy: 0.7720\n", + "Epoch 2/5\n", + "704/704 [==============================] - 301s 427ms/step - loss: 0.3469 - accuracy: 0.8512 - val_loss: 0.3214 - val_accuracy: 0.8608\n", + "Epoch 3/5\n", + "704/704 [==============================] - 295s 419ms/step - loss: 0.3054 - accuracy: 0.8713 - val_loss: 0.3069 - val_accuracy: 0.8672\n", + "Epoch 4/5\n", + "704/704 [==============================] - 295s 420ms/step - loss: 0.2798 - accuracy: 0.8828 - val_loss: 0.3028 - val_accuracy: 0.8672\n", + "Epoch 5/5\n", + "704/704 [==============================] - 298s 423ms/step - loss: 0.2622 - accuracy: 0.8920 - val_loss: 0.2953 - val_accuracy: 0.8700\n" + ] + } + ], "source": [ - "tf.random.set_seed(42)" + "# extra code – compiles and trains the model, as usual\n", + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "history = model.fit(train_set, validation_data=valid_set, epochs=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Extra material: using ragged tensors**" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "tfhub_cache_dir = Path() / \"my_tfhub_cache\"\n", - "os.environ[\"TFHUB_CACHE_DIR\"] = str(tfhub_cache_dir)" + "text_vec_layer_ragged = tf.keras.layers.TextVectorization(\n", + " max_tokens=vocab_size, ragged=True)\n", + "text_vec_layer_ragged.adapt(train_set.map(lambda reviews, labels: reviews))\n", + 
"text_vec_layer_ragged([\"Great movie!\", \"This is DiCaprio's best role.\"])" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "import tensorflow_hub as hub\n", - "\n", + "text_vec_layer([\"Great movie!\", \"This is DiCaprio's best role.\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the following cell will take a while to run (possibly 30 minutes if you are not using a GPU)." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "704/704 [==============================] - 280s 395ms/step - loss: 0.5038 - accuracy: 0.7496 - val_loss: 0.6706 - val_accuracy: 0.6752\n", + "Epoch 2/5\n", + "704/704 [==============================] - 277s 393ms/step - loss: 0.4499 - accuracy: 0.7892 - val_loss: 0.3494 - val_accuracy: 0.8500\n", + "Epoch 3/5\n", + "704/704 [==============================] - 276s 392ms/step - loss: 0.3270 - accuracy: 0.8592 - val_loss: 0.3855 - val_accuracy: 0.8260\n", + "Epoch 4/5\n", + "704/704 [==============================] - 277s 394ms/step - loss: 0.2935 - accuracy: 0.8760 - val_loss: 0.3401 - val_accuracy: 0.8520\n", + "Epoch 5/5\n", + "704/704 [==============================] - 275s 390ms/step - loss: 0.2742 - accuracy: 0.8854 - val_loss: 0.3971 - val_accuracy: 0.8208\n" + ] + } + ], + "source": [ + "embed_size = 128\n", + "tf.random.set_seed(42)\n", "model = tf.keras.Sequential([\n", - " hub.KerasLayer(\"https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1\",\n", - " dtype=tf.string, input_shape=[], output_shape=[50]),\n", - " tf.keras.layers.Dense(128, activation=\"relu\"),\n", + " 
text_vec_layer_ragged,\n", + " tf.keras.layers.Embedding(vocab_size, embed_size),\n", + " tf.keras.layers.GRU(128),\n", " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n", "])\n", - "model.compile(loss=\"binary_crossentropy\", optimizer=\"adam\",\n", - " metrics=[\"accuracy\"])" + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "history = model.fit(train_set, validation_data=valid_set, epochs=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's define a `tree()` function to view the structure of the cache directory TF Hub just created:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def tree(path, level=0, indent=4):\n", - " if level == 0:\n", - " print(f\"{path}/\")\n", - " level += 1\n", - " sub_paths = sorted(path.iterdir())\n", - " sub_dirs = [sub_path for sub_path in sub_paths if sub_path.is_dir()]\n", - " filepaths = [sub_path for sub_path in sub_paths if not sub_path in sub_dirs]\n", - " indent_str = \" \" * indent * level\n", - " for sub_dir in sub_dirs:\n", - " print(f\"{indent_str}{sub_dir.name}/\")\n", - " tree(sub_dir, level + 1, indent)\n", - " for filepath in filepaths:\n", - " print(f\"{indent_str}{filepath.name}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tree(tfhub_cache_dir)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow_datasets as tfds\n", - "\n", - "datasets, info = tfds.load(\"imdb_reviews\", as_supervised=True, with_info=True)\n", - "train_size = info.splits[\"train\"].num_examples\n", - "batch_size = 32\n", - "train_set = datasets[\"train\"].batch(batch_size).prefetch(1)\n", - "history = model.fit(train_set, epochs=5)" + "## Reusing Pretrained Embeddings and Language Models" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - 
"## Automatic Translation" + "**Warning**: the following cell will take a while to run (possibly an hour if you are not using a GPU)." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "704/704 [==============================] - 224s 303ms/step - loss: 0.3141 - accuracy: 0.8648 - val_loss: 0.2397 - val_accuracy: 0.9008\n", + "Epoch 2/10\n", + "704/704 [==============================] - 205s 291ms/step - loss: 0.0489 - accuracy: 0.9852 - val_loss: 0.3257 - val_accuracy: 0.8936\n", + "Epoch 3/10\n", + "704/704 [==============================] - 204s 290ms/step - loss: 0.0061 - accuracy: 0.9988 - val_loss: 0.3963 - val_accuracy: 0.8944\n", + "Epoch 4/10\n", + "704/704 [==============================] - 204s 290ms/step - loss: 9.4918e-04 - accuracy: 0.9999 - val_loss: 0.4291 - val_accuracy: 0.8924\n", + "Epoch 5/10\n", + "704/704 [==============================] - 203s 289ms/step - loss: 5.1920e-04 - accuracy: 1.0000 - val_loss: 0.4691 - val_accuracy: 0.8932\n", + "Epoch 6/10\n", + "704/704 [==============================] - 204s 289ms/step - loss: 5.0053e-04 - accuracy: 1.0000 - val_loss: 0.4687 - val_accuracy: 0.8912\n", + "Epoch 7/10\n", + "704/704 [==============================] - 208s 296ms/step - loss: 3.7360e-04 - accuracy: 1.0000 - val_loss: 0.5034 - val_accuracy: 0.8984\n", + "Epoch 8/10\n", + "704/704 [==============================] - 209s 297ms/step - loss: 2.3907e-05 - accuracy: 1.0000 - val_loss: 0.5773 - val_accuracy: 0.8924\n", + "Epoch 9/10\n", + "704/704 [==============================] - 204s 290ms/step - loss: 9.0970e-06 - accuracy: 1.0000 - val_loss: 0.6163 - val_accuracy: 0.8972\n", + "Epoch 10/10\n", + "704/704 [==============================] - 205s 291ms/step - loss: 5.2528e-06 - accuracy: 1.0000 - val_loss: 0.6455 - val_accuracy: 0.8956\n" + ] + }, + { + "data": { + "text/plain": [ + 
"" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "tf.random.set_seed(42)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "vocab_size = 100\n", - "embed_size = 10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow_addons as tfa\n", + "import os\n", + "import tensorflow_hub as hub\n", "\n", - "encoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "decoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "sequence_lengths = tf.keras.layers.Input(shape=[], dtype=np.int32)\n", - "\n", - "embeddings = tf.keras.layers.Embedding(vocab_size, embed_size)\n", - "encoder_embeddings = embeddings(encoder_inputs)\n", - "decoder_embeddings = embeddings(decoder_inputs)\n", - "\n", - "encoder = tf.keras.layers.LSTM(512, return_state=True)\n", - "encoder_outputs, state_h, state_c = encoder(encoder_embeddings)\n", - "encoder_state = [state_h, state_c]\n", - "\n", - "sampler = tfa.seq2seq.sampler.TrainingSampler()\n", - "\n", - "decoder_cell = tf.keras.layers.LSTMCell(512)\n", - "output_layer = tf.keras.layers.Dense(vocab_size)\n", - "decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell, sampler,\n", - " output_layer=output_layer)\n", - "final_outputs, final_state, final_sequence_lengths = decoder(\n", - " decoder_embeddings, initial_state=encoder_state,\n", - " sequence_length=sequence_lengths)\n", - "Y_proba = tf.nn.softmax(final_outputs.rnn_output)\n", - "\n", - "model = tf.keras.Model(\n", - " inputs=[encoder_inputs, decoder_inputs, sequence_lengths],\n", - " outputs=[Y_proba])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"adam\")" - ] - }, - { - "cell_type": "code", - "execution_count": 
null, - "metadata": {}, - "outputs": [], - "source": [ - "X = np.random.randint(100, size=1000 * 10).reshape(1000, 10)\n", - "Y = np.random.randint(100, size=1000 * 15).reshape(1000, 15)\n", - "X_decoder = np.c_[np.zeros((1000, 1)), Y[:, :-1]]\n", - "seq_lengths = np.full([1000], 10)\n", - "\n", - "history = model.fit([X, X_decoder, seq_lengths], Y, epochs=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Bidirectional Recurrent Layers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "os.environ[\"TFHUB_CACHE_DIR\"] = \"my_tfhub_cache\"\n", + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n", "model = tf.keras.Sequential([\n", - " tf.keras.layers.GRU(10, return_sequences=True, input_shape=[None, 10]),\n", - " tf.keras.layers.Bidirectional(tf.keras.layers.GRU(10, return_sequences=True))\n", + " hub.KerasLayer(\"https://tfhub.dev/google/universal-sentence-encoder/4\",\n", + " trainable=True, dtype=tf.string, input_shape=[]),\n", + " tf.keras.layers.Dense(64, activation=\"relu\"),\n", + " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n", "])\n", - "\n", - "model.summary()" + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "model.fit(train_set, validation_data=valid_set, epochs=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Positional Encoding" + "# An Encoder–Decoder Network for Neural Machine Translation" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "url = \"https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\"\n", + "path = tf.keras.utils.get_file(\"spa-eng.zip\", origin=url, cache_dir=\"datasets\",\n", + " extract=True)\n", + "text = (Path(path).with_name(\"spa-eng\") / \"spa.txt\").read_text()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + 
"metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "text = text.replace(\"¡\", \"\").replace(\"¿\", \"\")\n", + "pairs = [line.split(\"\\t\") for line in text.splitlines()]\n", + "np.random.seed(42) # extra code – ensures reproducibility on CPU\n", + "np.random.shuffle(pairs)\n", + "sentences_en, sentences_es = zip(*pairs) # separates the pairs into 2 lists" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "How boring! => Qué aburrimiento!\n", + "I love sports. => Adoro el deporte.\n", + "Would you like to swap jobs? => Te gustaría que intercambiemos los trabajos?\n" + ] + } + ], + "source": [ + "for i in range(3):\n", + " print(sentences_en[i], \"=>\", sentences_es[i])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "vocab_size = 1000\n", + "max_length = 50\n", + "text_vec_layer_en = tf.keras.layers.TextVectorization(\n", + " vocab_size, output_sequence_length=max_length)\n", + "text_vec_layer_es = tf.keras.layers.TextVectorization(\n", + " vocab_size, output_sequence_length=max_length)\n", + "text_vec_layer_en.adapt(sentences_en)\n", + "text_vec_layer_es.adapt([f\"startofseq {s} endofseq\" for s in sentences_es])" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['', '[UNK]', 'the', 'i', 'to', 'you', 'tom', 'a', 'is', 'he']" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text_vec_layer_en.get_vocabulary()[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['', '[UNK]', 'startofseq', 'endofseq', 'de', 'que', 'a', 'no', 'tom', 'la']" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "text_vec_layer_es.get_vocabulary()[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "X_train = tf.constant(sentences_en[:100_000])\n", + "X_valid = tf.constant(sentences_en[100_000:])\n", + "X_train_dec = tf.constant([f\"startofseq {s}\" for s in sentences_es[:100_000]])\n", + "X_valid_dec = tf.constant([f\"startofseq {s}\" for s in sentences_es[100_000:]])\n", + "Y_train = text_vec_layer_es([f\"{s} endofseq\" for s in sentences_es[:100_000]])\n", + "Y_valid = text_vec_layer_es([f\"{s} endofseq\" for s in sentences_es[100_000:]])" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n", + "encoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)\n", + "decoder_inputs = tf.keras.layers.Input(shape=[], dtype=tf.string)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "embed_size = 128\n", + "encoder_input_ids = text_vec_layer_en(encoder_inputs)\n", + "decoder_input_ids = text_vec_layer_es(decoder_inputs)\n", + "encoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,\n", + " mask_zero=True)\n", + "decoder_embedding_layer = tf.keras.layers.Embedding(vocab_size, embed_size,\n", + " mask_zero=True)\n", + "encoder_embeddings = encoder_embedding_layer(encoder_input_ids)\n", + "decoder_embeddings = decoder_embedding_layer(decoder_input_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "encoder = tf.keras.layers.LSTM(512, return_state=True)\n", + "encoder_outputs, *encoder_state = encoder(encoder_embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "decoder = tf.keras.layers.LSTM(512, return_sequences=True)\n", + "decoder_outputs = 
decoder(decoder_embeddings, initial_state=encoder_state)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "output_layer = tf.keras.layers.Dense(vocab_size, activation=\"softmax\")\n", + "Y_proba = output_layer(decoder_outputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the following cell will take a while to run (possibly a couple hours if you are not using a GPU)." + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "3125/3125 [==============================] - 698s 221ms/step - loss: 0.4154 - accuracy: 0.4256 - val_loss: 0.3069 - val_accuracy: 0.5246\n", + "Epoch 2/10\n", + "3125/3125 [==============================] - 686s 219ms/step - loss: 0.2631 - accuracy: 0.5745 - val_loss: 0.2367 - val_accuracy: 0.6055\n", + "Epoch 3/10\n", + "3125/3125 [==============================] - 686s 220ms/step - loss: 0.2066 - accuracy: 0.6457 - val_loss: 0.2061 - val_accuracy: 0.6500\n", + "Epoch 4/10\n", + "3125/3125 [==============================] - 682s 218ms/step - loss: 0.1740 - accuracy: 0.6907 - val_loss: 0.1920 - val_accuracy: 0.6691\n", + "Epoch 5/10\n", + "3125/3125 [==============================] - 676s 216ms/step - loss: 0.1507 - accuracy: 0.7237 - val_loss: 0.1865 - val_accuracy: 0.6767\n", + "Epoch 6/10\n", + "3125/3125 [==============================] - 675s 216ms/step - loss: 0.1316 - accuracy: 0.7522 - val_loss: 0.1847 - val_accuracy: 0.6804\n", + "Epoch 7/10\n", + "3125/3125 [==============================] - 675s 216ms/step - loss: 0.1154 - accuracy: 0.7774 - val_loss: 0.1866 - val_accuracy: 0.6822\n", + "Epoch 8/10\n", + "3125/3125 [==============================] - 673s 215ms/step - loss: 0.1011 - accuracy: 0.8007 - val_loss: 0.1907 - val_accuracy: 0.6829\n", + "Epoch 9/10\n", + "3125/3125 [==============================] - 
673s 215ms/step - loss: 0.0888 - accuracy: 0.8215 - val_loss: 0.1961 - val_accuracy: 0.6792\n", + "Epoch 10/10\n", + "3125/3125 [==============================] - 673s 215ms/step - loss: 0.0782 - accuracy: 0.8402 - val_loss: 0.2027 - val_accuracy: 0.6763\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n", + " outputs=[Y_proba])\n", + "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "model.fit((X_train, X_train_dec), Y_train, epochs=10,\n", + " validation_data=((X_valid, X_valid_dec), Y_valid))" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "def translate(sentence_en):\n", + " translation = \"\"\n", + " for word_idx in range(max_length):\n", + " X = np.array([sentence_en]) # encoder input \n", + " X_dec = np.array([\"startofseq \" + translation]) # decoder input\n", + " y_proba = model.predict((X, X_dec))[0, word_idx] # last token's probas\n", + " predicted_word_id = np.argmax(y_proba)\n", + " predicted_word = text_vec_layer_es.get_vocabulary()[predicted_word_id]\n", + " if predicted_word == \"endofseq\":\n", + " break\n", + " translation += \" \" + predicted_word\n", + " return translation.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'me gusta el fútbol'" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translate(\"I like soccer\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice! 
However, the model struggles with longer sentences:" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'me gusta el fútbol y a veces mismo al bus'" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translate(\"I like soccer and also going to the beach\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bidirectional RNNs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To create a bidirectional recurrent layer, just wrap a regular recurrent layer in a `Bidirectional` layer:" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n", + "encoder = tf.keras.layers.Bidirectional(\n", + " tf.keras.layers.LSTM(256, return_state=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "encoder_outputs, *encoder_state = encoder(encoder_embeddings)\n", + "encoder_state = [tf.concat(encoder_state[::2], axis=-1), # short-term (0 & 2)\n", + " tf.concat(encoder_state[1::2], axis=-1)] # long-term (1 & 3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the following cell will take a while to run (possibly a couple hours if you are not using a GPU)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "3125/3125 [==============================] - 574s 181ms/step - loss: 0.3075 - accuracy: 0.5393 - val_loss: 0.2192 - val_accuracy: 0.6319\n", + "Epoch 2/10\n", + "3125/3125 [==============================] - 564s 180ms/step - loss: 0.1916 - accuracy: 0.6689 - val_loss: 0.1880 - val_accuracy: 0.6731\n", + "Epoch 3/10\n", + "3125/3125 [==============================] - 566s 181ms/step - loss: 0.1602 - accuracy: 0.7119 - val_loss: 0.1751 - val_accuracy: 0.6916\n", + "Epoch 4/10\n", + "3125/3125 [==============================] - 566s 181ms/step - loss: 0.1395 - accuracy: 0.7415 - val_loss: 0.1715 - val_accuracy: 0.6979\n", + "Epoch 5/10\n", + "3125/3125 [==============================] - 566s 181ms/step - loss: 0.1227 - accuracy: 0.7666 - val_loss: 0.1707 - val_accuracy: 0.7025\n", + "Epoch 6/10\n", + "3125/3125 [==============================] - 567s 181ms/step - loss: 0.1085 - accuracy: 0.7887 - val_loss: 0.1730 - val_accuracy: 0.6995\n", + "Epoch 7/10\n", + "3125/3125 [==============================] - 571s 183ms/step - loss: 0.0961 - accuracy: 0.8089 - val_loss: 0.1764 - val_accuracy: 0.7000\n", + "Epoch 8/10\n", + "3125/3125 [==============================] - 567s 181ms/step - loss: 0.0852 - accuracy: 0.8273 - val_loss: 0.1821 - val_accuracy: 0.6981\n", + "Epoch 9/10\n", + "3125/3125 [==============================] - 565s 181ms/step - loss: 0.0759 - accuracy: 0.8438 - val_loss: 0.1881 - val_accuracy: 0.6956\n", + "Epoch 10/10\n", + "3125/3125 [==============================] - 565s 181ms/step - loss: 0.0682 - accuracy: 0.8577 - val_loss: 0.1951 - val_accuracy: 0.6906\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# extra code — completes the model and trains it\n", + 
"decoder = tf.keras.layers.LSTM(512, return_sequences=True)\n", + "decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)\n", + "output_layer = tf.keras.layers.Dense(vocab_size, activation=\"softmax\")\n", + "Y_proba = output_layer(decoder_outputs)\n", + "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n", + " outputs=[Y_proba])\n", + "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "model.fit((X_train, X_train_dec), Y_train, epochs=10,\n", + " validation_data=((X_valid, X_valid_dec), Y_valid))" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'me gusta el fútbol'" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translate(\"I like soccer\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Beam Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is a very basic implementation of beam search. I tried to make it readable and understandable, but it's definitely not optimized for speed! The function first uses the model to find the top _k_ words to start the translations (where _k_ is the beam width). For each of the top _k_ translations, it evaluates the conditional probabilities of all possible words it could add to that translation. These extended translations and their probabilities are added to the list of candidates. Once we've gone through all top _k_ translations and all words that could complete them, we keep only the top _k_ candidates with the highest probability, and we iterate over and over until they all finish with an EOS token. 
The top translation is then returned (after removing its EOS token).\n", + "\n", + "* Note: If p(S) is the probability of sentence S, and p(W|S) is the conditional probability of the word W given that the translation starts with S, then the probability of the sentence S' = concat(S, W) is p(S') = p(S) * p(W|S). As we add more words, the probability gets smaller and smaller. To avoid the risk of it getting too small, which could cause floating point precision errors, the function keeps track of log probabilities instead of probabilities: recall that log(a\\*b) = log(a) + log(b), therefore log(p(S')) = log(p(S)) + log(p(W|S))." + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – a basic implementation of beam search\n", + "\n", + "def beam_search(sentence_en, beam_width, verbose=False):\n", + " X = np.array([sentence_en]) # encoder input\n", + " X_dec = np.array([\"startofseq\"]) # decoder input\n", + " y_proba = model.predict((X, X_dec))[0, 0] # first token's probas\n", + " top_k = tf.math.top_k(y_proba, k=beam_width)\n", + " top_translations = [ # list of best (log_proba, translation)\n", + " (np.log(word_proba), text_vec_layer_es.get_vocabulary()[word_id])\n", + " for word_proba, word_id in zip(top_k.values, top_k.indices)\n", + " ]\n", + " \n", + " # extra code – displays the top first words in verbose mode\n", + " if verbose:\n", + " print(\"Top first words:\", top_translations)\n", + "\n", + " for idx in range(1, max_length):\n", + " candidates = []\n", + " for log_proba, translation in top_translations:\n", + " if translation.endswith(\"endofseq\"):\n", + " candidates.append((log_proba, translation))\n", + " continue # translation is finished, so don't try to extend it\n", + " X = np.array([sentence_en]) # encoder input\n", + " X_dec = np.array([\"startofseq \" + translation]) # decoder input\n", + " y_proba = model.predict((X, X_dec))[0, idx] # last token's proba\n", + " for word_id, 
word_proba in enumerate(y_proba):\n", + " word = text_vec_layer_es.get_vocabulary()[word_id]\n", + " candidates.append((log_proba + np.log(word_proba),\n", + " f\"{translation} {word}\"))\n", + " top_translations = sorted(candidates, reverse=True)[:beam_width]\n", + "\n", + " # extra code – displays the top translation so far in verbose mode\n", + " if verbose:\n", + " print(\"Top translations so far:\", top_translations)\n", + "\n", + " if all([tr.endswith(\"endofseq\") for _, tr in top_translations]):\n", + " return top_translations[0][1].replace(\"endofseq\", \"\").strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'me [UNK] los gatos y los gatos'" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# extra code – shows the model making an error\n", + "sentence_en = \"I love cats and dogs\"\n", + "translate(sentence_en)" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top first words: [(-0.012974381, 'me'), (-4.592527, '[UNK]'), (-6.314033, 'yo')]\n", + "Top translations so far: [(-0.4831518, 'me [UNK]'), (-1.4920667, 'me encanta'), (-1.986235, 'me gustan')]\n", + "Top translations so far: [(-0.6793061, 'me [UNK] los'), (-1.9889652, 'me gustan los'), (-2.0470557, 'me encanta los')]\n", + "Top translations so far: [(-0.7609749, 'me [UNK] los gatos'), (-2.0677316, 'me gustan los gatos'), (-2.26029, 'me encanta los gatos')]\n", + "Top translations so far: [(-0.76985043, 'me [UNK] los gatos y'), (-2.0701222, 'me gustan los gatos y'), (-2.2649746, 'me encanta los gatos y')]\n", + "Top translations so far: [(-0.81283045, 'me [UNK] los gatos y los'), (-2.118244, 'me gustan los gatos y los'), (-2.96167, 'me encanta los gatos y los')]\n", + "Top translations so far: [(-1.2259341, 'me [UNK] los gatos y los
gatos'), (-1.9556838, 'me [UNK] los gatos y los perros'), (-2.7524388, 'me gustan los gatos y los perros')]\n", + "Top translations so far: [(-1.2261332, 'me [UNK] los gatos y los gatos endofseq'), (-1.9560521, 'me [UNK] los gatos y los perros endofseq'), (-2.7566314, 'me gustan los gatos y los perros endofseq')]\n" + ] + }, + { + "data": { + "text/plain": [ + "'me [UNK] los gatos y los gatos'" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# extra code – shows how beam search can help\n", + "beam_search(sentence_en, beam_width=3, verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The correct translation is in the top 3 sentences found by beam search, but it's not the first. Since we're using a small vocabulary, the \\[UNK] token is quite frequent, so you may want to penalize it (e.g., divide its probability by 2 in the beam search function): this will discourage beam search from using it too much." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Attention Mechanisms" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We need to feed all the encoder's outputs to the `Attention` layer, so we must add `return_sequences=True` to the encoder:" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n", + "encoder = tf.keras.layers.Bidirectional(\n", + " tf.keras.layers.LSTM(256, return_sequences=True, return_state=True))" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – this part of the model is exactly the same as earlier\n", + "encoder_outputs, *encoder_state = encoder(encoder_embeddings)\n", + "encoder_state = [tf.concat(encoder_state[::2], axis=-1), # short-term (0 & 2)\n", + " tf.concat(encoder_state[1::2], axis=-1)] # long-term (1 & 3)\n", + "decoder = tf.keras.layers.LSTM(512, return_sequences=True)\n", + "decoder_outputs = decoder(decoder_embeddings, initial_state=encoder_state)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And finally, let's add the `Attention` layer and the output layer:" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "attention_layer = tf.keras.layers.Attention()\n", + "attention_outputs = attention_layer([decoder_outputs, encoder_outputs])\n", + "output_layer = tf.keras.layers.Dense(vocab_size, activation=\"softmax\")\n", + "Y_proba = output_layer(attention_outputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: the following cell will take a while to run (possibly a couple hours if you are not using a GPU)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "3125/3125 [==============================] - 597s 189ms/step - loss: 0.3074 - accuracy: 0.5469 - val_loss: 0.2106 - val_accuracy: 0.6487\n", + "Epoch 2/10\n", + "3125/3125 [==============================] - 585s 187ms/step - loss: 0.1902 - accuracy: 0.6789 - val_loss: 0.1865 - val_accuracy: 0.6830\n", + "Epoch 3/10\n", + "3125/3125 [==============================] - 585s 187ms/step - loss: 0.1659 - accuracy: 0.7123 - val_loss: 0.1759 - val_accuracy: 0.7005\n", + "Epoch 4/10\n", + "3125/3125 [==============================] - 584s 187ms/step - loss: 0.1493 - accuracy: 0.7359 - val_loss: 0.1728 - val_accuracy: 0.7060\n", + "Epoch 5/10\n", + "3125/3125 [==============================] - 582s 186ms/step - loss: 0.1358 - accuracy: 0.7548 - val_loss: 0.1724 - val_accuracy: 0.7084\n", + "Epoch 6/10\n", + "3125/3125 [==============================] - 583s 186ms/step - loss: 0.1245 - accuracy: 0.7712 - val_loss: 0.1738 - val_accuracy: 0.7103\n", + "Epoch 7/10\n", + "3125/3125 [==============================] - 582s 186ms/step - loss: 0.1148 - accuracy: 0.7863 - val_loss: 0.1770 - val_accuracy: 0.7111\n", + "Epoch 8/10\n", + "3125/3125 [==============================] - 582s 186ms/step - loss: 0.1064 - accuracy: 0.7992 - val_loss: 0.1806 - val_accuracy: 0.7110\n", + "Epoch 9/10\n", + "3125/3125 [==============================] - 582s 186ms/step - loss: 0.0991 - accuracy: 0.8101 - val_loss: 0.1862 - val_accuracy: 0.7088\n", + "Epoch 10/10\n", + "3125/3125 [==============================] - 581s 186ms/step - loss: 0.0929 - accuracy: 0.8205 - val_loss: 0.1903 - val_accuracy: 0.7077\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = tf.keras.Model(inputs=[encoder_inputs, 
decoder_inputs],\n", + " outputs=[Y_proba])\n", + "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "model.fit((X_train, X_train_dec), Y_train, epochs=10,\n", + " validation_data=((X_valid, X_valid_dec), Y_valid))" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'me gusta el fútbol y también ir a la playa'" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translate(\"I like soccer and also going to the beach\")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top first words: [(-0.26210824, 'me'), (-2.553061, 'prefiero'), (-3.2005944, 'yo')]\n", + "Top translations so far: [(-0.32478744, 'me gusta'), (-3.0608056, 'prefiero el'), (-3.1685317, 'me gustan')]\n", + "Top translations so far: [(-0.7464272, 'me gusta el'), (-2.4712462, 'me gusta fútbol'), (-2.9149299, 'me gusta al')]\n", + "Top translations so far: [(-1.0369574, 'me gusta el fútbol'), (-2.3301778, 'me gusta el el'), (-2.9658434, 'me gusta fútbol y')]\n", + "Top translations so far: [(-1.0404125, 'me gusta el fútbol y'), (-2.5983238, 'me gusta el el fútbol'), (-2.9736564, 'me gusta fútbol y también')]\n", + "Top translations so far: [(-1.0520902, 'me gusta el fútbol y también'), (-2.6003318, 'me gusta el el fútbol y'), (-3.128903, 'me gusta fútbol y también me')]\n", + "Top translations so far: [(-1.9568634, 'me gusta el fútbol y también ir'), (-2.6169589, 'me gusta el el fútbol y también'), (-2.6949644, 'me gusta el fútbol y también fuera')]\n", + "Top translations so far: [(-1.9676423, 'me gusta el fútbol y también ir a'), (-2.8482866, 'me gusta el fútbol y también fuera a'), (-3.7197533, 'me gusta el el fútbol y también ir')]\n", + "Top translations so far: [(-1.9692448, 'me gusta el 
fútbol y también ir a la'), (-2.8501132, 'me gusta el fútbol y también fuera a la'), (-3.7309551, 'me gusta el el fútbol y también ir a')]\n", + "Top translations so far: [(-1.9733216, 'me gusta el fútbol y también ir a la playa'), (-2.851697, 'me gusta el fútbol y también fuera a la playa'), (-3.7333717, 'me gusta el el fútbol y también ir a la')]\n", + "Top translations so far: [(-1.9737166, 'me gusta el fútbol y también ir a la playa endofseq'), (-2.8547554, 'me gusta el fútbol y también fuera a la playa endofseq'), (-3.737218, 'me gusta el el fútbol y también ir a la playa')]\n", + "Top translations so far: [(-1.9737166, 'me gusta el fútbol y también ir a la playa endofseq'), (-2.8547554, 'me gusta el fútbol y también fuera a la playa endofseq'), (-3.7375438, 'me gusta el el fútbol y también ir a la playa endofseq')]\n" + ] + }, + { + "data": { + "text/plain": [ + "'me gusta el fútbol y también ir a la playa'" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "beam_search(\"I like soccer and also going to the beach\", beam_width=3,\n", + " verbose=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Attention Is All You Need: The Transformer Architecture\n", + "### Positional encodings" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "max_length = 50 # max length in the whole training set\n", + "embed_size = 128\n", + "tf.random.set_seed(42) # extra code – ensures reproducibility on CPU\n", + "pos_embed_layer = tf.keras.layers.Embedding(max_length, embed_size)\n", + "batch_max_len_enc = tf.shape(encoder_embeddings)[1]\n", + "encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))\n", + "batch_max_len_dec = tf.shape(decoder_embeddings)[1]\n", + "decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "Alternatively, we can use fixed, non-trainable positional encodings:" + ] + }, + { + "cell_type": "code", + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "class PositionalEncoding(tf.keras.layers.Layer):\n", - " def __init__(self, max_steps, max_dims, dtype=tf.float32, **kwargs):\n", + " def __init__(self, max_length, embed_size, dtype=tf.float32, **kwargs):\n", " super().__init__(dtype=dtype, **kwargs)\n", - " if max_dims % 2 == 1: max_dims += 1 # max_dims must be even\n", - " p, i = np.meshgrid(np.arange(max_steps), np.arange(max_dims // 2))\n", - " pos_emb = np.empty((1, max_steps, max_dims))\n", - " pos_emb[0, :, ::2] = np.sin(p / 10000**(2 * i / max_dims)).T\n", - " pos_emb[0, :, 1::2] = np.cos(p / 10000**(2 * i / max_dims)).T\n", - " self.positional_embedding = tf.constant(pos_emb.astype(self.dtype))\n", + " max_dims = (embed_size + 1) // 2 * 2 # round up to nearest even number\n", + " p, i = np.meshgrid(np.arange(max_length), 2 * np.arange(max_dims // 2))\n", + " pos_emb = np.empty((1, max_length, max_dims))\n", + " pos_emb[0, :, ::2] = np.sin(p / 10_000 ** (i / max_dims)).T\n", + " pos_emb[0, :, 1::2] = np.cos(p / 10_000 ** (i / max_dims)).T\n", + " self.pos_encodings = tf.constant(pos_emb.astype(self.dtype))\n", + " self.supports_masking = True\n", + "\n", " def call(self, inputs):\n", - " shape = tf.shape(inputs)\n", - " return inputs + self.positional_embedding[:, :shape[-2], :shape[-1]]" + " batch_max_length = tf.shape(inputs)[1]\n", + " return inputs + self.pos_encodings[:, :batch_max_length]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ - "max_steps = 201\n", - "max_dims = 512\n", - "pos_emb = PositionalEncoding(max_steps, max_dims)\n", - "PE = pos_emb(np.zeros((1, max_steps, max_dims), np.float32))[0].numpy()" + "pos_embed_layer = PositionalEncoding(max_length, embed_size)\n", + "encoder_in = pos_embed_layer(encoder_embeddings)\n", + 
"decoder_in = pos_embed_layer(decoder_embeddings)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 83, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], "source": [ + "# extra code – this cells generates and saves Figure 16–9\n", + "figure_max_length = 201\n", + "figure_embed_size = 512\n", + "pos_emb = PositionalEncoding(figure_max_length, figure_embed_size)\n", + "zeros = np.zeros((1, figure_max_length, figure_embed_size), np.float32)\n", + "P = pos_emb(zeros)[0].numpy()\n", "i1, i2, crop_i = 100, 101, 150\n", "p1, p2, p3 = 22, 60, 35\n", "fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(9, 5))\n", "ax1.plot([p1, p1], [-1, 1], \"k--\", label=\"$p = {}$\".format(p1))\n", "ax1.plot([p2, p2], [-1, 1], \"k--\", label=\"$p = {}$\".format(p2), alpha=0.5)\n", - "ax1.plot(p3, PE[p3, i1], \"bx\", label=\"$p = {}$\".format(p3))\n", - "ax1.plot(PE[:,i1], \"b-\", label=\"$i = {}$\".format(i1))\n", - "ax1.plot(PE[:,i2], \"r-\", label=\"$i = {}$\".format(i2))\n", - "ax1.plot([p1, p2], [PE[p1, i1], PE[p2, i1]], \"bo\")\n", - "ax1.plot([p1, p2], [PE[p1, i2], PE[p2, i2]], \"ro\")\n", + "ax1.plot(p3, P[p3, i1], \"bx\", label=\"$p = {}$\".format(p3))\n", + "ax1.plot(P[:,i1], \"b-\", label=\"$i = {}$\".format(i1))\n", + "ax1.plot(P[:,i2], \"r-\", label=\"$i = {}$\".format(i2))\n", + "ax1.plot([p1, p2], [P[p1, i1], P[p2, i1]], \"bo\")\n", + "ax1.plot([p1, p2], [P[p1, i2], P[p2, i2]], \"ro\")\n", "ax1.legend(loc=\"center right\", fontsize=14, framealpha=0.95)\n", "ax1.set_ylabel(\"$P_{(p,i)}$\", rotation=0, fontsize=16)\n", "ax1.grid(True, alpha=0.3)\n", - "ax1.hlines(0, 0, max_steps - 1, color=\"k\", linewidth=1, alpha=0.3)\n", - "ax1.axis([0, max_steps - 1, -1, 1])\n", - "ax2.imshow(PE.T[:crop_i], cmap=\"gray\", interpolation=\"bilinear\", aspect=\"auto\")\n", - "ax2.hlines(i1, 0, max_steps - 1, color=\"b\")\n", - "cheat = 2 # need to raise the red line a bit, or else it hides the blue one\n", - "ax2.hlines(i2+cheat, 0, max_steps - 1, color=\"r\")\n", + "ax1.hlines(0, 0, figure_max_length - 1, color=\"k\", 
linewidth=1, alpha=0.3)\n", + "ax1.axis([0, figure_max_length - 1, -1, 1])\n", + "ax2.imshow(P.T[:crop_i], cmap=\"gray\", interpolation=\"bilinear\", aspect=\"auto\")\n", + "ax2.hlines(i1, 0, figure_max_length - 1, color=\"b\", linewidth=3)\n", + "cheat = 2 # need to raise the red line a bit, or else it hides the blue one\n", + "ax2.hlines(i2+cheat, 0, figure_max_length - 1, color=\"r\", linewidth=3)\n", "ax2.plot([p1, p1], [0, crop_i], \"k--\")\n", "ax2.plot([p2, p2], [0, crop_i], \"k--\", alpha=0.5)\n", "ax2.plot([p1, p2], [i2+cheat, i2+cheat], \"ro\")\n", "ax2.plot([p1, p2], [i1, i1], \"bo\")\n", - "ax2.axis([0, max_steps - 1, 0, crop_i])\n", + "ax2.axis([0, figure_max_length - 1, 0, crop_i])\n", "ax2.set_xlabel(\"$p$\", fontsize=16)\n", "ax2.set_ylabel(\"$i$\", rotation=0, fontsize=16)\n", "save_fig(\"positional_embedding_plot\")\n", "plt.show()" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "embed_size = 512; max_steps = 500; vocab_size = 10000\n", - "encoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "decoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "embeddings = tf.keras.layers.Embedding(vocab_size, embed_size)\n", - "encoder_embeddings = embeddings(encoder_inputs)\n", - "decoder_embeddings = embeddings(decoder_inputs)\n", - "positional_encoding = PositionalEncoding(max_steps, max_dims=embed_size)\n", - "encoder_in = positional_encoding(encoder_embeddings)\n", - "decoder_in = positional_encoding(decoder_embeddings)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here is a (very) simplified Transformer (the actual architecture has skip connections, layer norm, dense nets, and most importantly it uses Multi-Head Attention instead of regular Attention):" + "### Multi-Head Attention" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ + "N = 2 # instead of 
6\n", + "num_heads = 8\n", + "dropout_rate = 0.1\n", + "n_units = 128 # for the first Dense layer in each Feed Forward block\n", + "encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]\n", "Z = encoder_in\n", - "for N in range(6):\n", - " Z = tf.keras.layers.Attention(use_scale=True)([Z, Z])\n", - "\n", - "encoder_outputs = Z\n", - "Z = decoder_in\n", - "for N in range(6):\n", - " Z = tf.keras.layers.Attention(use_scale=True, causal=True)([Z, Z])\n", - " Z = tf.keras.layers.Attention(use_scale=True)([Z, encoder_outputs])\n", - "\n", - "outputs = tf.keras.layers.TimeDistributed(\n", - " tf.keras.layers.Dense(vocab_size, activation=\"softmax\"))(Z)" + "for _ in range(N):\n", + " skip = Z\n", + " attn_layer = tf.keras.layers.MultiHeadAttention(\n", + " num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)\n", + " Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)\n", + " Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))\n", + " skip = Z\n", + " Z = tf.keras.layers.Dense(n_units, activation=\"relu\")(Z)\n", + " Z = tf.keras.layers.Dense(embed_size)(Z)\n", + " Z = tf.keras.layers.Dropout(dropout_rate)(Z)\n", + " Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]\n", + "causal_mask = tf.linalg.band_part( # creates a lower triangular matrix\n", + " tf.ones((batch_max_len_dec, batch_max_len_dec), tf.bool), -1, 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "encoder_outputs = Z # let's save the encoder's final outputs\n", + "Z = decoder_in # the decoder starts with its own inputs\n", + "for _ in range(N):\n", + " skip = Z\n", + " attn_layer = tf.keras.layers.MultiHeadAttention(\n", + " num_heads=num_heads, key_dim=embed_size, 
dropout=dropout_rate)\n", + " Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)\n", + " Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))\n", + " skip = Z\n", + " attn_layer = tf.keras.layers.MultiHeadAttention(\n", + " num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)\n", + " Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)\n", + " Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))\n", + " skip = Z\n", + " Z = tf.keras.layers.Dense(n_units, activation=\"relu\")(Z)\n", + " Z = tf.keras.layers.Dense(embed_size)(Z)\n", + " Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here's a basic implementation of the `MultiHeadAttention` layer. One will likely be added to `tf.keras.layers` in the near future. Note that `Conv1D` layers with `kernel_size=1` (and the default `padding=\"valid\"` and `strides=1`) is equivalent to a `TimeDistributed(Dense(...))` layer." + "**Warning**: the following cell will take a while to run (possibly 2 or 3 hours if you are not using a GPU)." 
] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "3125/3125 [==============================] - 828s 263ms/step - loss: 0.2982 - accuracy: 0.5545 - val_loss: 0.2105 - val_accuracy: 0.6476\n", + "Epoch 2/10\n", + "3125/3125 [==============================] - 820s 262ms/step - loss: 0.2006 - accuracy: 0.6601 - val_loss: 0.1876 - val_accuracy: 0.6802\n", + "Epoch 3/10\n", + "3125/3125 [==============================] - 820s 263ms/step - loss: 0.1842 - accuracy: 0.6816 - val_loss: 0.1766 - val_accuracy: 0.6975\n", + "Epoch 4/10\n", + "3125/3125 [==============================] - 820s 262ms/step - loss: 0.1748 - accuracy: 0.6942 - val_loss: 0.1704 - val_accuracy: 0.7055\n", + "Epoch 5/10\n", + "3125/3125 [==============================] - 820s 262ms/step - loss: 0.1683 - accuracy: 0.7021 - val_loss: 0.1657 - val_accuracy: 0.7102\n", + "Epoch 6/10\n", + "3125/3125 [==============================] - 821s 263ms/step - loss: 0.1628 - accuracy: 0.7096 - val_loss: 0.1628 - val_accuracy: 0.7130\n", + "Epoch 7/10\n", + "3125/3125 [==============================] - 826s 264ms/step - loss: 0.1588 - accuracy: 0.7154 - val_loss: 0.1595 - val_accuracy: 0.7205\n", + "Epoch 8/10\n", + "3125/3125 [==============================] - 822s 263ms/step - loss: 0.1550 - accuracy: 0.7205 - val_loss: 0.1590 - val_accuracy: 0.7199\n", + "Epoch 9/10\n", + "3125/3125 [==============================] - 821s 263ms/step - loss: 0.1518 - accuracy: 0.7249 - val_loss: 0.1547 - val_accuracy: 0.7258\n", + "Epoch 10/10\n", + "3125/3125 [==============================] - 821s 263ms/step - loss: 0.1492 - accuracy: 0.7279 - val_loss: 0.1538 - val_accuracy: 0.7281\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Y_proba = tf.keras.layers.Dense(vocab_size, 
activation=\"softmax\")(Z)\n", + "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n", + " outputs=[Y_proba])\n", + "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "model.fit((X_train, X_train_dec), Y_train, epochs=10,\n", + " validation_data=((X_valid, X_valid_dec), Y_valid))" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'me gusta el fútbol y yo también voy a la playa'" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "translate(\"I like soccer and also going to the beach\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# HuggingFace" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the Transformers and Datasets libraries if we're running on Colab:" + ] + }, + { + "cell_type": "code", + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ - "K = tf.keras.backend\n", + "if \"google.colab\" in sys.modules:\n", + " %pip install -q -U transformers\n", + " %pip install -q -U datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)\n", + "All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.\n", + "\n", + "All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.\n" + ] + } + ], + 
"source": [ + "from transformers import pipeline\n", "\n", - "class MultiHeadAttention(tf.keras.layers.Layer):\n", - " def __init__(self, n_heads, causal=False, use_scale=False, **kwargs):\n", - " self.n_heads = n_heads\n", - " self.causal = causal\n", - " self.use_scale = use_scale\n", - " super().__init__(**kwargs)\n", - " def build(self, batch_input_shape):\n", - " self.dims = batch_input_shape[0][-1]\n", - " self.q_dims, self.v_dims, self.k_dims = [self.dims // self.n_heads] * 3 # could be hyperparameters instead\n", - " self.q_linear = tf.keras.layers.Conv1D(self.n_heads * self.q_dims, kernel_size=1, use_bias=False)\n", - " self.v_linear = tf.keras.layers.Conv1D(self.n_heads * self.v_dims, kernel_size=1, use_bias=False)\n", - " self.k_linear = tf.keras.layers.Conv1D(self.n_heads * self.k_dims, kernel_size=1, use_bias=False)\n", - " self.attention = tf.keras.layers.Attention(causal=self.causal, use_scale=self.use_scale)\n", - " self.out_linear = tf.keras.layers.Conv1D(self.dims, kernel_size=1, use_bias=False)\n", - " super().build(batch_input_shape)\n", - " def _multi_head_linear(self, inputs, linear):\n", - " shape = K.concatenate([K.shape(inputs)[:-1], [self.n_heads, -1]])\n", - " projected = K.reshape(linear(inputs), shape)\n", - " perm = K.permute_dimensions(projected, [0, 2, 1, 3])\n", - " return K.reshape(perm, [shape[0] * self.n_heads, shape[1], -1])\n", - " def call(self, inputs):\n", - " q = inputs[0]\n", - " v = inputs[1]\n", - " k = inputs[2] if len(inputs) > 2 else v\n", - " shape = K.shape(q)\n", - " q_proj = self._multi_head_linear(q, self.q_linear)\n", - " v_proj = self._multi_head_linear(v, self.v_linear)\n", - " k_proj = self._multi_head_linear(k, self.k_linear)\n", - " multi_attended = self.attention([q_proj, v_proj, k_proj])\n", - " shape_attended = K.shape(multi_attended)\n", - " reshaped_attended = K.reshape(multi_attended, [shape[0], self.n_heads, shape_attended[1], shape_attended[2]])\n", - " perm = K.permute_dimensions(reshaped_attended, 
[0, 2, 1, 3])\n", - " concat = K.reshape(perm, [shape[0], shape_attended[1], -1])\n", - " return self.out_linear(concat)" + "classifier = pipeline(\"sentiment-analysis\") # many other tasks are available\n", + "result = classifier(\"The actors were very convincing.\")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 91, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[{'label': 'POSITIVE', 'score': 0.9896161556243896},\n", + " {'label': 'NEGATIVE', 'score': 0.9811071157455444}]" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Q = np.random.rand(2, 50, 512)\n", - "V = np.random.rand(2, 80, 512)\n", - "multi_attn = MultiHeadAttention(8)\n", - "multi_attn([Q, V]).shape" + "classifier([\"I am from India.\", \"I am from Iraq.\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some layers from the model checkpoint at huggingface/distilbert-base-uncased-finetuned-mnli were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']\n", + "- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at huggingface/distilbert-base-uncased-finetuned-mnli and are newly initialized: ['dropout_39']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'label': 'contradiction', 'score': 0.9790192246437073}]" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_name = \"huggingface/distilbert-base-uncased-finetuned-mnli\"\n", + "classifier_mnli = pipeline(\"text-classification\", model=model_name)\n", + "classifier_mnli(\"She loves me. [SEP] She loves me not.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some layers from the model checkpoint at huggingface/distilbert-base-uncased-finetuned-mnli were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']\n", + "- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at huggingface/distilbert-base-uncased-finetuned-mnli and are newly initialized: ['dropout_59']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + } + ], + "source": [ + "from transformers import AutoTokenizer, TFAutoModelForSequenceClassification\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "model = TFAutoModelForSequenceClassification.from_pretrained(model_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': , 'attention_mask': }" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token_ids = tokenizer([\"I like soccer. [SEP] We all love soccer!\",\n", + " \"Joe lived for a very long time. 
[SEP] Joe is old.\"],\n", + " padding=True, return_tensors=\"tf\")\n", + "token_ids" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': , 'attention_mask': }" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "token_ids = tokenizer([(\"I like soccer.\", \"We all love soccer!\"),\n", + " (\"Joe lived for a very long time.\", \"Joe is old.\")],\n", + " padding=True, return_tensors=\"tf\")\n", + "token_ids" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TFSequenceClassifierOutput(loss=None, logits=, hidden_states=None, attentions=None)" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "outputs = model(token_ids)\n", + "outputs" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Y_probas = tf.keras.activations.softmax(outputs.logits)\n", + "Y_probas" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Y_pred = tf.argmax(Y_probas, axis=1)\n", + "Y_pred # 0 = contradiction, 1 = entailment, 2 = neutral" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/2\n", + "1/1 [==============================] - 10s 10s/step - loss: 1.1190 - accuracy: 0.5000\n", + "Epoch 2/2\n", + "1/1 [==============================] - 0s 491ms/step - loss: 0.6666 - accuracy: 0.5000\n" + ] + } + ], + "source": 
[ + "sentences = [(\"Sky is blue\", \"Sky is red\"), (\"I love her\", \"She loves me\")]\n", + "X_train = tokenizer(sentences, padding=True, return_tensors=\"tf\").data\n", + "y_train = tf.constant([0, 2]) # contradiction, neutral\n", + "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", + "model.compile(loss=loss, optimizer=\"nadam\", metrics=[\"accuracy\"])\n", + "history = model.fit(X_train, y_train, epochs=2)" ] }, { @@ -1420,8 +2939,8 @@ "source": [ "1. Stateless RNNs can only capture patterns whose length is less than, or equal to, the size of the windows the RNN is trained on. Conversely, stateful RNNs can capture longer-term patterns. However, implementing a stateful RNN is much harder⁠—especially preparing the dataset properly. Moreover, stateful RNNs do not always work better, in part because consecutive batches are not independent and identically distributed (IID). Gradient Descent is not fond of non-IID datasets.\n", "2. In general, if you translate a sentence one word at a time, the result will be terrible. For example, the French sentence \"Je vous en prie\" means \"You are welcome,\" but if you translate it one word at a time, you get \"I you in pray.\" Huh? It is much better to read the whole sentence first and then translate it. A plain sequence-to-sequence RNN would start translating a sentence immediately after reading the first word, while an Encoder–Decoder RNN will first read the whole sentence and then translate it. That said, one could imagine a plain sequence-to-sequence RNN that would output silence whenever it is unsure about what to say next (just like human translators do when they must translate a live broadcast).\n", - "3. Variable-length input sequences can be handled by padding the shorter sequences so that all sequences in a batch have the same length, and using masking to ensure the RNN ignores the padding token. 
For better performance, you may also want to create batches containing sequences of similar sizes. Ragged tensors can hold sequences of variable lengths, and tf.keras will likely support them eventually, which will greatly simplify handling variable-length input sequences (at the time of this writing, it is not the case yet). Regarding variable-length output sequences, if the length of the output sequence is known in advance (e.g., if you know that it is the same as the input sequence), then you just need to configure the loss function so that it ignores tokens that come after the end of the sequence. Similarly, the code that will use the model should ignore tokens beyond the end of the sequence. But generally the length of the output sequence is not known ahead of time, so the solution is to train the model so that it outputs an end-of-sequence token at the end of each sequence.\n", - "4. Beam search is a technique used to improve the performance of a trained Encoder–Decoder model, for example in a neural machine translation system. The algorithm keeps track of a short list of the _k_ most promising output sentences (say, the top three), and at each decoder step it tries to extend them by one word; then it keeps only the _k_ most likely sentences. The parameter _k_ is called the _beam width_: the larger it is, the more CPU and RAM will be used, but also the more accurate the system will be. Instead of greedily choosing the most likely next word at each step to extend a single sentence, this technique allows the system to explore several promising sentences simultaneously. Moreover, this technique lends itself well to parallelization. You can implement beam search fairly easily using TensorFlow Addons.\n", + "3. Variable-length input sequences can be handled by padding the shorter sequences so that all sequences in a batch have the same length, and using masking to ensure the RNN ignores the padding token. 
For better performance, you may also want to create batches containing sequences of similar sizes. Ragged tensors can hold sequences of variable lengths, and Keras now supports them, which simplifies handling variable-length input sequences (at the time of this writing, it still does not handle ragged tensors as targets on the GPU, though). Regarding variable-length output sequences, if the length of the output sequence is known in advance (e.g., if you know that it is the same as the input sequence), then you just need to configure the loss function so that it ignores tokens that come after the end of the sequence. Similarly, the code that will use the model should ignore tokens beyond the end of the sequence. But generally the length of the output sequence is not known ahead of time, so the solution is to train the model so that it outputs an end-of-sequence token at the end of each sequence.\n", + "4. Beam search is a technique used to improve the performance of a trained Encoder–Decoder model, for example in a neural machine translation system. The algorithm keeps track of a short list of the _k_ most promising output sentences (say, the top three), and at each decoder step it tries to extend them by one word; then it keeps only the _k_ most likely sentences. The parameter _k_ is called the _beam width_: the larger it is, the more CPU and RAM will be used, but also the more accurate the system will be. Instead of greedily choosing the most likely next word at each step to extend a single sentence, this technique allows the system to explore several promising sentences simultaneously. Moreover, this technique lends itself well to parallelization. You can implement beam search by writing a custom memory cell. Alternatively, TensorFlow Addons's seq2seq API provides an implementation.\n", "5. 
An attention mechanism is a technique initially used in Encoder–Decoder models to give the decoder more direct access to the input sequence, allowing it to deal with longer input sequences. At each decoder time step, the current decoder's state and the full output of the encoder are processed by an alignment model that outputs an alignment score for each input time step. This score indicates which part of the input is most relevant to the current decoder time step. The weighted sum of the encoder outputs (weighted by their alignment scores) is then fed to the decoder, which produces the next decoder state and the output for this time step. The main benefit of using an attention mechanism is the fact that the Encoder–Decoder model can successfully process longer input sequences. Another benefit is that the alignment scores make the model easier to debug and interpret: for example, if the model makes a mistake, you can look at which part of the input it was paying attention to, and this can help diagnose the issue. An attention mechanism is also at the core of the Transformer architecture, in the Multi-Head Attention layers. See the next answer.\n", "6. The most important layer in the Transformer architecture is the Multi-Head Attention layer (the original Transformer architecture contains 18 of them, including 6 Masked Multi-Head Attention layers). It is at the core of language models such as BERT and GPT-2. Its purpose is to allow the model to identify which words are most aligned with each other, and then improve each word's representation using these contextual clues.\n", "7. Sampled softmax is used to train a classification model when there are many classes (e.g., thousands). It computes an approximation of the cross-entropy loss based on the logit predicted by the model for the correct class, and the predicted logits for a sample of incorrect classes. 
This speeds up training considerably compared to computing the softmax over all logits and then estimating the cross-entropy loss. After training, the model can be used normally, using the regular softmax function to compute all the class probabilities based on all the logits." @@ -1444,7 +2963,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1487,9 +3006,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 101, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BTXXTTVPXTVPXTTVPSE BPVPSE BTXSE BPVVE BPVVE BTSXSE BPTVPXTTTVVE BPVVE BTXSE BTXXVPSE BPTTTTTTTTVVE BTXSE BPVPSE BTXSE BPTVPSE BTXXTVPSE BPVVE BPVVE BPVVE BPTTVVE BPVVE BPVVE BTXXVVE BTXXVVE BTXXVPXVVE " + ] + } + ], "source": [ "np.random.seed(42)\n", "\n", @@ -1506,9 +3033,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 102, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BTBPTTTVPXTVPXTTVPSETE BPBPTVPSEPE BPBPVVEPE BPBPVPXVVEPE BPBTXXTTTTVVEPE BPBPVPSEPE BPBTXXVPSEPE BPBTSSSSSSSXSEPE BTBPVVETE BPBTXXVVEPE BPBTXXVPSEPE BTBTXXVVETE BPBPVVEPE BPBPVVEPE BPBTSXSEPE BPBPVVEPE BPBPTVPSEPE BPBTXXVVEPE BTBPTVPXVVETE BTBPVVETE BTBTSSSSSSSXXVVETE BPBTSSSXXTTTTVPSEPE BTBPTTVVETE BPBTXXTVVEPE BTBTXSETE " + ] + } + ], "source": [ "np.random.seed(42)\n", "\n", @@ -1525,7 +3060,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1548,9 +3083,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 104, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BTBPTTTPPXTVPXTTVPSETE BPBTXEEPE BPBPTVVVEPE BPBTSSSSXSETE BPTTXSEPE BTBPVPXTTTTTTEVETE BPBTXXSVEPE BSBPTTVPSETE BPBXVVEPE BEBTXSETE BPBPVPSXPE 
BTBPVVVETE BPBTSXSETE BPBPTTTPTTTTTVPSEPE BTBTXXTTSTVPSETE BBBTXSETE BPBTPXSEPE BPBPVPXTTTTVPXTVPXVPXTTTVVEVE BTBXXXTVPSETE BEBTSSSSSXXVPXTVVETE BTBXTTVVETE BPBTXSTPE BTBTXXTTTVPSBTE BTBTXSETX BTBTSXSSTE " + ] + } + ], "source": [ "np.random.seed(42)\n", "\n", @@ -1567,7 +3110,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1577,9 +3120,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 106, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[0, 4, 4, 4, 6, 6, 5, 5, 1, 4, 1]" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "string_to_ids(\"BTTTXXVVETE\")" ] @@ -1593,15 +3147,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "def generate_dataset(size):\n", - " good_strings = [string_to_ids(generate_string(embedded_reber_grammar))\n", - " for _ in range(size // 2)]\n", - " bad_strings = [string_to_ids(generate_corrupted_string(embedded_reber_grammar))\n", - " for _ in range(size - size // 2)]\n", + " good_strings = [\n", + " string_to_ids(generate_string(embedded_reber_grammar))\n", + " for _ in range(size // 2)\n", + " ]\n", + " bad_strings = [\n", + " string_to_ids(generate_corrupted_string(embedded_reber_grammar))\n", + " for _ in range(size - size // 2)\n", + " ]\n", " all_strings = good_strings + bad_strings\n", " X = tf.ragged.constant(all_strings, ragged_rank=1)\n", " y = np.array([[1.] 
for _ in range(len(good_strings))] +\n", @@ -1611,7 +3169,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1630,9 +3188,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 109, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X_train[0]" ] @@ -1646,9 +3217,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 110, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "array([1.])" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "y_train[0]" ] @@ -1662,9 +3244,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 111, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/20\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "313/313 [==============================] - 4s 8ms/step - loss: 0.6910 - accuracy: 0.5095 - val_loss: 0.6825 - val_accuracy: 0.5645\n", + "Epoch 2/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.6678 - accuracy: 0.5659 - val_loss: 0.6635 - val_accuracy: 0.6105\n", + "Epoch 3/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.6504 - accuracy: 0.5766 - val_loss: 0.6521 - val_accuracy: 0.6110\n", + "Epoch 4/20\n", + "313/313 [==============================] - 2s 8ms/step - loss: 0.6347 - accuracy: 0.5980 - val_loss: 0.6224 - val_accuracy: 0.6445\n", + "Epoch 5/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.6054 - accuracy: 0.6361 - val_loss: 0.5779 - val_accuracy: 0.6980\n", + "Epoch 6/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.5414 - accuracy: 0.7093 - 
val_loss: 0.4695 - val_accuracy: 0.7795\n", + "Epoch 7/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.3756 - accuracy: 0.8418 - val_loss: 0.2685 - val_accuracy: 0.9115\n", + "Epoch 8/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.2601 - accuracy: 0.9044 - val_loss: 0.1534 - val_accuracy: 0.9615\n", + "Epoch 9/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.1774 - accuracy: 0.9427 - val_loss: 0.1063 - val_accuracy: 0.9735\n", + "Epoch 10/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.0624 - accuracy: 0.9826 - val_loss: 0.0219 - val_accuracy: 0.9975\n", + "Epoch 11/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.0371 - accuracy: 0.9914 - val_loss: 0.0055 - val_accuracy: 1.0000\n", + "Epoch 12/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 0.0029 - accuracy: 0.9995 - val_loss: 8.7265e-04 - val_accuracy: 1.0000\n", + "Epoch 13/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 6.7552e-04 - accuracy: 1.0000 - val_loss: 4.9408e-04 - val_accuracy: 1.0000\n", + "Epoch 14/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 4.4514e-04 - accuracy: 1.0000 - val_loss: 3.6322e-04 - val_accuracy: 1.0000\n", + "Epoch 15/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 3.3943e-04 - accuracy: 1.0000 - val_loss: 2.8524e-04 - val_accuracy: 1.0000\n", + "Epoch 16/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 2.7723e-04 - accuracy: 1.0000 - val_loss: 2.3880e-04 - val_accuracy: 1.0000\n", + "Epoch 17/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 2.3477e-04 - accuracy: 1.0000 - val_loss: 2.0363e-04 - val_accuracy: 1.0000\n", + "Epoch 18/20\n", + "313/313 [==============================] - 2s 7ms/step - loss: 2.0382e-04 - accuracy: 1.0000 - val_loss: 1.7760e-04 - val_accuracy: 1.0000\n", + "Epoch 19/20\n", + 
"313/313 [==============================] - 2s 7ms/step - loss: 1.8077e-04 - accuracy: 1.0000 - val_loss: 1.5916e-04 - val_accuracy: 1.0000\n", + "Epoch 20/20\n", + "313/313 [==============================] - 2s 8ms/step - loss: 1.6246e-04 - accuracy: 1.0000 - val_loss: 1.4362e-04 - val_accuracy: 1.0000\n" + ] + } + ], "source": [ "np.random.seed(42)\n", "tf.random.set_seed(42)\n", @@ -1673,13 +3308,17 @@ "\n", "model = tf.keras.Sequential([\n", " tf.keras.layers.InputLayer(input_shape=[None], dtype=tf.int32, ragged=True),\n", - " tf.keras.layers.Embedding(input_dim=len(POSSIBLE_CHARS), output_dim=embedding_size),\n", + " tf.keras.layers.Embedding(input_dim=len(POSSIBLE_CHARS),\n", + " output_dim=embedding_size),\n", " tf.keras.layers.GRU(30),\n", " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n", "])\n", - "optimizer = tf.keras.optimizers.SGD(learning_rate=0.02, momentum = 0.95, nesterov=True)\n", - "model.compile(loss=\"binary_crossentropy\", optimizer=optimizer, metrics=[\"accuracy\"])\n", - "history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))" + "optimizer = tf.keras.optimizers.SGD(learning_rate=0.02, momentum = 0.95,\n", + " nesterov=True)\n", + "model.compile(loss=\"binary_crossentropy\", optimizer=optimizer,\n", + " metrics=[\"accuracy\"])\n", + "history = model.fit(X_train, y_train, epochs=20,\n", + " validation_data=(X_valid, y_valid))" ] }, { @@ -1691,9 +3330,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 112, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Estimated probability that these are Reber strings:\n", + "BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE: 0.02%\n", + "BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE: 99.99%\n" + ] + } + ], "source": [ "test_strings = [\"BPBTSSSSSSSXXTTVPXVPXTTTTTVVETE\",\n", " \"BPBTSSSSSSSXXTTVPXVPXTTTTTVVEPE\"]\n", @@ -1730,7 +3380,7 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1761,9 +3411,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 114, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Input Target \n", + "--------------------------------------------------\n", + "September 20, 7075 7075-09-20 \n", + "May 15, 8579 8579-05-15 \n", + "January 11, 7103 7103-01-11 \n" + ] + } + ], "source": [ "np.random.seed(42)\n", "\n", @@ -1784,9 +3446,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 115, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "' ,0123456789ADFJMNOSabceghilmnoprstuvy'" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "INPUT_CHARS = \"\".join(sorted(set(\"\".join(MONTHS) + \"0123456789, \")))\n", "INPUT_CHARS" @@ -1801,7 +3474,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -1817,7 +3490,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -1827,25 +3500,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 118, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[19, 23, 31, 34, 23, 28, 21, 23, 32, 0, 4, 2, 1, 0, 9, 2, 9, 7]" + ] + }, + "execution_count": 118, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "date_str_to_ids(x_example[0], INPUT_CHARS)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 119, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[7, 0, 7, 5, 10, 0, 9, 10, 2, 0]" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "date_str_to_ids(y_example[0], OUTPUT_CHARS)" ] 
}, { "cell_type": "code", - "execution_count": null, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -1861,7 +3556,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -1874,9 +3569,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 122, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 122, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "Y_train[0]" ] @@ -1899,9 +3605,56 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 123, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/20\n", + "313/313 [==============================] - 10s 23ms/step - loss: 1.8150 - accuracy: 0.3489 - val_loss: 1.3726 - val_accuracy: 0.4939\n", + "Epoch 2/20\n", + "313/313 [==============================] - 7s 22ms/step - loss: 1.2447 - accuracy: 0.5510 - val_loss: 1.0725 - val_accuracy: 0.6115\n", + "Epoch 3/20\n", + "313/313 [==============================] - 7s 23ms/step - loss: 1.0937 - accuracy: 0.6125 - val_loss: 1.0548 - val_accuracy: 0.6130\n", + "Epoch 4/20\n", + "313/313 [==============================] - 7s 23ms/step - loss: 1.0032 - accuracy: 0.6413 - val_loss: 3.8747 - val_accuracy: 0.1788\n", + "Epoch 5/20\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.8159 - accuracy: 0.7023 - val_loss: 0.6623 - val_accuracy: 0.7474\n", + "Epoch 6/20\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.5645 - accuracy: 0.7795 - val_loss: 0.5005 - val_accuracy: 0.8032\n", + "Epoch 7/20\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.5037 - accuracy: 0.8103 - val_loss: 0.3798 - val_accuracy: 0.8500\n", + "Epoch 8/20\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.3131 - accuracy: 
0.8795 - val_loss: 0.2582 - val_accuracy: 0.9043\n", + "Epoch 9/20\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.2141 - accuracy: 0.9280 - val_loss: 0.1637 - val_accuracy: 0.9498\n", + "Epoch 10/20\n", + "313/313 [==============================] - 9s 28ms/step - loss: 0.1282 - accuracy: 0.9650 - val_loss: 0.0918 - val_accuracy: 0.9774\n", + "Epoch 11/20\n", + "313/313 [==============================] - 9s 28ms/step - loss: 0.0669 - accuracy: 0.9871 - val_loss: 0.3368 - val_accuracy: 0.8871\n", + "Epoch 12/20\n", + "313/313 [==============================] - 10s 32ms/step - loss: 0.1551 - accuracy: 0.9662 - val_loss: 0.0398 - val_accuracy: 0.9949\n", + "Epoch 13/20\n", + "313/313 [==============================] - 9s 29ms/step - loss: 0.0291 - accuracy: 0.9969 - val_loss: 0.0240 - val_accuracy: 0.9984\n", + "Epoch 14/20\n", + "313/313 [==============================] - 9s 30ms/step - loss: 0.0182 - accuracy: 0.9986 - val_loss: 0.0161 - val_accuracy: 0.9993\n", + "Epoch 15/20\n", + "313/313 [==============================] - 9s 30ms/step - loss: 0.0119 - accuracy: 0.9995 - val_loss: 0.0112 - val_accuracy: 0.9997\n", + "Epoch 16/20\n", + "313/313 [==============================] - 10s 32ms/step - loss: 0.0082 - accuracy: 0.9998 - val_loss: 0.0083 - val_accuracy: 0.9999\n", + "Epoch 17/20\n", + "313/313 [==============================] - 10s 33ms/step - loss: 0.0059 - accuracy: 0.9999 - val_loss: 0.0058 - val_accuracy: 0.9999\n", + "Epoch 18/20\n", + "313/313 [==============================] - 11s 34ms/step - loss: 0.0042 - accuracy: 1.0000 - val_loss: 0.0043 - val_accuracy: 0.9999\n", + "Epoch 19/20\n", + "313/313 [==============================] - 10s 33ms/step - loss: 0.0031 - accuracy: 1.0000 - val_loss: 0.0034 - val_accuracy: 0.9999\n", + "Epoch 20/20\n", + "313/313 [==============================] - 12s 40ms/step - loss: 0.0024 - accuracy: 1.0000 - val_loss: 0.0026 - val_accuracy: 1.0000\n" + ] + } + ], "source": [ "embedding_size = 
32\n", "max_output_length = Y_train.shape[1]\n", @@ -1943,7 +3696,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -1961,7 +3714,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -1970,11 +3723,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 126, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2009-09-17\n", + "1789-07-14\n" + ] + } + ], "source": [ - "#ids = model.predict_classes(X_new)\n", "ids = model.predict(X_new).argmax(axis=-1)\n", "for date_str in ids_to_date_strs(ids):\n", " print(date_str)" @@ -1996,7 +3757,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -2005,11 +3766,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 128, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2020-02-02\n", + "1789-01-14\n" + ] + } + ], "source": [ - "#ids = model.predict_classes(X_new)\n", "ids = model.predict(X_new).argmax(axis=-1)\n", "for date_str in ids_to_date_strs(ids):\n", " print(date_str)" @@ -2024,7 +3793,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -2038,16 +3807,26 @@ "\n", "def convert_date_strs(date_strs):\n", " X = prepare_date_strs_padded(date_strs)\n", - " #ids = model.predict_classes(X)\n", " ids = model.predict(X).argmax(axis=-1)\n", " return ids_to_date_strs(ids)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 130, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['2020-05-02', '1789-07-14']" + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } 
+ ], "source": [ "convert_date_strs([\"May 02, 2020\", \"July 14, 1789\"])" ] @@ -2090,7 +3869,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -2114,9 +3893,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 132, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "X_train_decoder" ] @@ -2130,9 +3927,36 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 133, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "313/313 [==============================] - 11s 27ms/step - loss: 1.6824 - accuracy: 0.3734 - val_loss: 1.4054 - val_accuracy: 0.4681\n", + "Epoch 2/10\n", + "313/313 [==============================] - 8s 26ms/step - loss: 1.1935 - accuracy: 0.5550 - val_loss: 0.8868 - val_accuracy: 0.6750\n", + "Epoch 3/10\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.6403 - accuracy: 0.7700 - val_loss: 0.3493 - val_accuracy: 0.8978\n", + "Epoch 4/10\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.2292 - accuracy: 0.9423 - val_loss: 0.1254 - val_accuracy: 0.9782\n", + "Epoch 5/10\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.0694 - accuracy: 0.9932 - val_loss: 0.0441 - val_accuracy: 0.9982\n", + "Epoch 6/10\n", + "313/313 [==============================] - 9s 29ms/step - loss: 0.0576 - accuracy: 0.9923 - val_loss: 0.0280 - val_accuracy: 0.9988\n", + "Epoch 7/10\n", + "313/313 [==============================] - 8s 26ms/step - loss: 0.0179 - accuracy: 0.9998 - val_loss: 0.0143 - val_accuracy: 0.9999\n", + "Epoch 8/10\n", + "313/313 [==============================] - 6s 18ms/step - loss: 0.0107 - accuracy: 0.9999 - val_loss: 0.0092 - val_accuracy: 
0.9999\n", + "Epoch 9/10\n", + "313/313 [==============================] - 6s 20ms/step - loss: 0.0070 - accuracy: 1.0000 - val_loss: 0.0065 - val_accuracy: 0.9999\n", + "Epoch 10/10\n", + "313/313 [==============================] - 6s 18ms/step - loss: 0.0050 - accuracy: 1.0000 - val_loss: 0.0047 - val_accuracy: 0.9999\n" + ] + } + ], "source": [ "encoder_embedding_size = 32\n", "decoder_embedding_size = 32\n", @@ -2184,7 +4008,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -2204,9 +4028,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 135, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['1789-07-14', '2020-05-01']" + ] + }, + "execution_count": 135, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "predict_date_strs([\"July 14, 1789\", \"May 01, 2020\"])" ] @@ -2215,526 +4050,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Works fine! :)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Third version: using TF-Addons's seq2seq implementation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's build exactly the same model, but using TF-Addon's seq2seq API. The implementation below is almost very similar to the TFA example higher in this notebook, except without the model input to specify the output sequence length, for simplicity (but you can easily add it back in if you need it for your projects, when the output sequences have very different lengths)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow_addons as tfa\n", - "\n", - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "encoder_embedding_size = 32\n", - "decoder_embedding_size = 32\n", - "units = 128\n", - "\n", - "encoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "decoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "sequence_lengths = tf.keras.layers.Input(shape=[], dtype=np.int32)\n", - "\n", - "encoder_embeddings = tf.keras.layers.Embedding(\n", - " len(INPUT_CHARS) + 1, encoder_embedding_size)(encoder_inputs)\n", - "\n", - "decoder_embedding_layer = tf.keras.layers.Embedding(\n", - " len(OUTPUT_CHARS) + 2, decoder_embedding_size)\n", - "decoder_embeddings = decoder_embedding_layer(decoder_inputs)\n", - "\n", - "encoder = tf.keras.layers.LSTM(units, return_state=True)\n", - "encoder_outputs, state_h, state_c = encoder(encoder_embeddings)\n", - "encoder_state = [state_h, state_c]\n", - "\n", - "sampler = tfa.seq2seq.sampler.TrainingSampler()\n", - "\n", - "decoder_cell = tf.keras.layers.LSTMCell(units)\n", - "output_layer = tf.keras.layers.Dense(len(OUTPUT_CHARS) + 1)\n", - "\n", - "decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,\n", - " sampler,\n", - " output_layer=output_layer)\n", - "final_outputs, final_state, final_sequence_lengths = decoder(\n", - " decoder_embeddings,\n", - " initial_state=encoder_state)\n", - "Y_proba = tf.keras.layers.Activation(\"softmax\")(final_outputs.rnn_output)\n", - "\n", - "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n", - " outputs=[Y_proba])\n", - "optimizer = tf.keras.optimizers.Nadam()\n", - "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n", - " metrics=[\"accuracy\"])\n", - "history = model.fit([X_train, X_train_decoder], Y_train, epochs=15,\n", - " validation_data=([X_valid, X_valid_decoder], Y_valid))" - 
] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And once again, 100% validation accuracy! To use the model, we can just reuse the `predict_date_strs()` function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "predict_date_strs([\"July 14, 1789\", \"May 01, 2020\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "However, there's a much more efficient way to perform inference. Until now, during inference, we've run the model once for each new character. Instead, we can create a new decoder, based on the previously trained layers, but using a `GreedyEmbeddingSampler` instead of a `TrainingSampler`.\n", - "\n", - "At each time step, the `GreedyEmbeddingSampler` will compute the argmax of the decoder's outputs, and run the resulting token IDs through the decoder's embedding layer. Then it will feed the resulting embeddings to the decoder's LSTM cell at the next time step. This way, we only need to run the decoder once to get the full prediction." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "inference_sampler = tfa.seq2seq.sampler.GreedyEmbeddingSampler(\n", - " embedding_fn=decoder_embedding_layer)\n", - "inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(\n", - " decoder_cell, inference_sampler, output_layer=output_layer,\n", - " maximum_iterations=max_output_length)\n", - "batch_size = tf.shape(encoder_inputs)[:1]\n", - "start_tokens = tf.fill(dims=batch_size, value=sos_id)\n", - "final_outputs, final_state, final_sequence_lengths = inference_decoder(\n", - " start_tokens,\n", - " initial_state=encoder_state,\n", - " start_tokens=start_tokens,\n", - " end_token=0)\n", - "\n", - "inference_model = tf.keras.Model(inputs=[encoder_inputs],\n", - " outputs=[final_outputs.sample_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A few notes:\n", - "* The `GreedyEmbeddingSampler` needs the `start_tokens` (a vector containing the start-of-sequence ID for each decoder sequence), and the `end_token` (the decoder will stop decoding a sequence once the model outputs this token).\n", - "* We must set `maximum_iterations` when creating the `BasicDecoder`, or else it may run into an infinite loop (if the model never outputs the end token for at least one of the sequences). This would force you would to restart the Jupyter kernel.\n", - "* The decoder inputs are not needed anymore, since all the decoder inputs are generated dynamically based on the outputs from the previous time step.\n", - "* The model's outputs are `final_outputs.sample_id` instead of the softmax of `final_outputs.rnn_outputs`. This allows us to directly get the argmax of the model's outputs. If you prefer to have access to the logits, you can replace `final_outputs.sample_id` with `final_outputs.rnn_outputs`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can write a simple function that uses the model to perform the date format conversion:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def fast_predict_date_strs(date_strs):\n", - " X = prepare_date_strs_padded(date_strs)\n", - " Y_pred = inference_model.predict(X)\n", - " return ids_to_date_strs(Y_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "fast_predict_date_strs([\"July 14, 1789\", \"May 01, 2020\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's check that it really is faster:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%timeit predict_date_strs([\"July 14, 1789\", \"May 01, 2020\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%timeit fast_predict_date_strs([\"July 14, 1789\", \"May 01, 2020\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "That's more than a 10x speedup! And it would be even more if we were handling longer sequences." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fourth version: using TF-Addons's seq2seq implementation with a scheduled sampler" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: due to a TF bug, this version only works using TensorFlow 2.2 or above." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "When we trained the previous model, at each time step _t_ we gave the model the target token for time step _t_ - 1. However, at inference time, the model did not get the previous target at each time step. Instead, it got the previous prediction. 
So there is a discrepancy between training and inference, which may lead to disappointing performance. To alleviate this, we can gradually replace the targets with the predictions, during training. For this, we just need to replace the `TrainingSampler` with a `ScheduledEmbeddingTrainingSampler`, and use a Keras callback to gradually increase the `sampling_probability` (i.e., the probability that the decoder will use the prediction from the previous time step rather than the target for the previous time step)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import tensorflow_addons as tfa\n", - "\n", - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "n_epochs = 20\n", - "encoder_embedding_size = 32\n", - "decoder_embedding_size = 32\n", - "units = 128\n", - "\n", - "encoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "decoder_inputs = tf.keras.layers.Input(shape=[None], dtype=np.int32)\n", - "sequence_lengths = tf.keras.layers.Input(shape=[], dtype=np.int32)\n", - "\n", - "encoder_embeddings = tf.keras.layers.Embedding(\n", - " len(INPUT_CHARS) + 1, encoder_embedding_size)(encoder_inputs)\n", - "\n", - "decoder_embedding_layer = tf.keras.layers.Embedding(\n", - " len(OUTPUT_CHARS) + 2, decoder_embedding_size)\n", - "decoder_embeddings = decoder_embedding_layer(decoder_inputs)\n", - "\n", - "encoder = tf.keras.layers.LSTM(units, return_state=True)\n", - "encoder_outputs, state_h, state_c = encoder(encoder_embeddings)\n", - "encoder_state = [state_h, state_c]\n", - "\n", - "sampler = tfa.seq2seq.sampler.ScheduledEmbeddingTrainingSampler(\n", - " sampling_probability=0.,\n", - " embedding_fn=decoder_embedding_layer)\n", - "# we must set the sampling_probability after creating the sampler\n", - "# (see https://github.com/tensorflow/addons/pull/1714)\n", - "sampler.sampling_probability = tf.Variable(0.)\n", - "\n", - "decoder_cell = 
tf.keras.layers.LSTMCell(units)\n", - "output_layer = tf.keras.layers.Dense(len(OUTPUT_CHARS) + 1)\n", - "\n", - "decoder = tfa.seq2seq.basic_decoder.BasicDecoder(decoder_cell,\n", - " sampler,\n", - " output_layer=output_layer)\n", - "final_outputs, final_state, final_sequence_lengths = decoder(\n", - " decoder_embeddings,\n", - " initial_state=encoder_state)\n", - "Y_proba = tf.keras.layers.Activation(\"softmax\")(final_outputs.rnn_output)\n", - "\n", - "model = tf.keras.Model(inputs=[encoder_inputs, decoder_inputs],\n", - " outputs=[Y_proba])\n", - "optimizer = tf.keras.optimizers.Nadam()\n", - "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n", - " metrics=[\"accuracy\"])\n", - "\n", - "def update_sampling_probability(epoch, logs):\n", - " proba = min(1.0, epoch / (n_epochs - 10))\n", - " sampler.sampling_probability.assign(proba)\n", - "\n", - "sampling_probability_cb = tf.keras.callbacks.LambdaCallback(\n", - " on_epoch_begin=update_sampling_probability)\n", - "history = model.fit([X_train, X_train_decoder], Y_train, epochs=n_epochs,\n", - " validation_data=([X_valid, X_valid_decoder], Y_valid),\n", - " callbacks=[sampling_probability_cb])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Not quite 100% validation accuracy, but close enough!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For inference, we could do the exact same thing as earlier, using a `GreedyEmbeddingSampler`. However, just for the sake of completeness, let's use a `SampleEmbeddingSampler` instead. It's almost the same thing, except that instead of using the argmax of the model's output to find the token ID, it treats the outputs as logits and uses them to sample a token ID randomly. This can be useful when you want to generate text. 
The `softmax_temperature` argument serves the \n", - "same purpose as when we generated Shakespeare-like text (the higher this argument, the more random the generated text will be)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "softmax_temperature = tf.Variable(1.)\n", - "\n", - "inference_sampler = tfa.seq2seq.sampler.SampleEmbeddingSampler(\n", - " embedding_fn=decoder_embedding_layer,\n", - " softmax_temperature=softmax_temperature)\n", - "inference_decoder = tfa.seq2seq.basic_decoder.BasicDecoder(\n", - " decoder_cell, inference_sampler, output_layer=output_layer,\n", - " maximum_iterations=max_output_length)\n", - "batch_size = tf.shape(encoder_inputs)[:1]\n", - "start_tokens = tf.fill(dims=batch_size, value=sos_id)\n", - "final_outputs, final_state, final_sequence_lengths = inference_decoder(\n", - " start_tokens,\n", - " initial_state=encoder_state,\n", - " start_tokens=start_tokens,\n", - " end_token=0)\n", - "\n", - "inference_model = tf.keras.Model(inputs=[encoder_inputs],\n", - " outputs=[final_outputs.sample_id])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def creative_predict_date_strs(date_strs, temperature=1.0):\n", - " softmax_temperature.assign(temperature)\n", - " X = prepare_date_strs_padded(date_strs)\n", - " Y_pred = inference_model.predict(X)\n", - " return ids_to_date_strs(Y_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tf.random.set_seed(42)\n", - "\n", - "creative_predict_date_strs([\"July 14, 1789\", \"May 01, 2020\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dates look good at room temperature. 
Now let's heat things up a bit:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "tf.random.set_seed(42)\n", - "\n", - "creative_predict_date_strs([\"July 14, 1789\", \"May 01, 2020\"],\n", - " temperature=5.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Oops, the dates are overcooked, now. Let's call them \"creative\" dates." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Fifth version: using TFA seq2seq, the Keras subclassing API and attention mechanisms" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The sequences in this problem are pretty short, but if we wanted to tackle longer sequences, we would probably have to use attention mechanisms. While it's possible to code our own implementation, it's simpler and more efficient to use TF-Addons's implementation instead. Let's do that now, this time using Keras' subclassing API.\n", - "\n", - "**Warning**: due to a TensorFlow bug (see [this issue](https://github.com/tensorflow/addons/issues/1153) for details), the `get_initial_state()` method fails in eager mode, so for now we have to use the subclassing API, as Keras automatically calls `tf.function()` on the `call()` method (so it runs in graph mode)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this implementation, we've reverted back to using the `TrainingSampler`, for simplicity (but you can easily tweak it to use a `ScheduledEmbeddingTrainingSampler` instead). 
We also use a `GreedyEmbeddingSampler` during inference, so this class is pretty easy to use:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class DateTranslation(tf.keras.Model):\n", - " def __init__(self, units=128, encoder_embedding_size=32,\n", - " decoder_embedding_size=32, **kwargs):\n", - " super().__init__(**kwargs)\n", - " self.encoder_embedding = tf.keras.layers.Embedding(\n", - " input_dim=len(INPUT_CHARS) + 1,\n", - " output_dim=encoder_embedding_size)\n", - " self.encoder = tf.keras.layers.LSTM(units,\n", - " return_sequences=True,\n", - " return_state=True)\n", - " self.decoder_embedding = tf.keras.layers.Embedding(\n", - " input_dim=len(OUTPUT_CHARS) + 2,\n", - " output_dim=decoder_embedding_size)\n", - " self.attention = tfa.seq2seq.LuongAttention(units)\n", - " decoder_inner_cell = tf.keras.layers.LSTMCell(units)\n", - " self.decoder_cell = tfa.seq2seq.AttentionWrapper(\n", - " cell=decoder_inner_cell,\n", - " attention_mechanism=self.attention)\n", - " output_layer = tf.keras.layers.Dense(len(OUTPUT_CHARS) + 1)\n", - " self.decoder = tfa.seq2seq.BasicDecoder(\n", - " cell=self.decoder_cell,\n", - " sampler=tfa.seq2seq.sampler.TrainingSampler(),\n", - " output_layer=output_layer)\n", - " self.inference_decoder = tfa.seq2seq.BasicDecoder(\n", - " cell=self.decoder_cell,\n", - " sampler=tfa.seq2seq.sampler.GreedyEmbeddingSampler(\n", - " embedding_fn=self.decoder_embedding),\n", - " output_layer=output_layer,\n", - " maximum_iterations=max_output_length)\n", - "\n", - " def call(self, inputs, training=None):\n", - " encoder_input, decoder_input = inputs\n", - " encoder_embeddings = self.encoder_embedding(encoder_input)\n", - " encoder_outputs, encoder_state_h, encoder_state_c = self.encoder(\n", - " encoder_embeddings,\n", - " training=training)\n", - " encoder_state = [encoder_state_h, encoder_state_c]\n", - "\n", - " self.attention(encoder_outputs,\n", - " setup_memory=True)\n", - 
" \n", - " decoder_embeddings = self.decoder_embedding(decoder_input)\n", - "\n", - " decoder_initial_state = self.decoder_cell.get_initial_state(\n", - " decoder_embeddings)\n", - " decoder_initial_state = decoder_initial_state.clone(\n", - " cell_state=encoder_state)\n", - " \n", - " if training:\n", - " decoder_outputs, _, _ = self.decoder(\n", - " decoder_embeddings,\n", - " initial_state=decoder_initial_state,\n", - " training=training)\n", - " else:\n", - " start_tokens = tf.zeros_like(encoder_input[:, 0]) + sos_id\n", - " decoder_outputs, _, _ = self.inference_decoder(\n", - " decoder_embeddings,\n", - " initial_state=decoder_initial_state,\n", - " start_tokens=start_tokens,\n", - " end_token=0)\n", - "\n", - " return tf.nn.softmax(decoder_outputs.rnn_output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = DateTranslation()\n", - "optimizer = tf.keras.optimizers.Nadam()\n", - "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n", - " metrics=[\"accuracy\"])\n", - "history = model.fit([X_train, X_train_decoder], Y_train, epochs=25,\n", - " validation_data=([X_valid, X_valid_decoder], Y_valid))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Not quite 100% validation accuracy, but close. It took a bit longer to converge this time, but there were also more parameters and more computations per iteration. 
And we did not use a scheduled sampler.\n", - "\n", - "To use the model, we can write yet another little function:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def fast_predict_date_strs_v2(date_strs):\n", - " X = prepare_date_strs_padded(date_strs)\n", - " X_decoder = tf.zeros(shape=(len(X), max_output_length), dtype=tf.int32)\n", - " Y_probas = model.predict([X, X_decoder])\n", - " Y_pred = tf.argmax(Y_probas, axis=-1)\n", - " return ids_to_date_strs(Y_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "fast_predict_date_strs_v2([\"July 14, 1789\", \"May 01, 2020\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are still a few interesting features from TF-Addons that you may want to look at:\n", - "* Using a `BeamSearchDecoder` rather than a `BasicDecoder` for inference. Instead of outputing the character with the highest probability, this decoder keeps track of the several candidates, and keeps only the most likely sequences of candidates (see chapter 16 in the book for more details).\n", - "* Setting masks or specifying `sequence_length` if the input or target sequences may have very different lengths.\n", - "* Using a `ScheduledOutputTrainingSampler`, which gives you more flexibility than the `ScheduledEmbeddingTrainingSampler` to decide how to feed the output at time _t_ to the cell at time _t_+1. By default it feeds the outputs directly to cell, without computing the argmax ID and passing it through an embedding layer. Alternatively, you specify a `next_inputs_fn` function that will be used to convert the cell outputs to inputs at the next step." + "Works fine! Next, feel free to write a Transformer version. 
:)" ] }, { @@ -2742,14 +4058,14 @@ "metadata": {}, "source": [ "## 10.\n", - "_Exercise: Go through TensorFlow's [Neural Machine Translation with Attention tutorial](https://homl.info/nmttuto)._" + "_Exercise: Go through Keras's tutorial for [Natural language image search with a Dual Encoder](https://homl.info/dualtuto). You will learn how to build a model capable of representing both images and text within the same embedding space. This makes it possible to search for images using a text prompt, like in the [CLIP model](https://openai.com/blog/clip/) by OpenAI._ " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Simply open the Colab and follow its instructions. Alternatively, if you want a simpler example of using TF-Addons's seq2seq implementation for Neural Machine Translation (NMT), look at the solution to the previous question. The last model implementation will give you a simpler example of using TF-Addons to build an NMT model using attention mechanisms." + "Just click the link and follow the instructions." ] }, { @@ -2757,14 +4073,7 @@ "metadata": {}, "source": [ "## 11.\n", - "_Exercise: Use one of the recent language models (e.g., GPT) to generate more convincing Shakespearean text._" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The simplest way to use recent language models is to use the excellent [transformers library](https://huggingface.co/transformers/), open sourced by Hugging Face. It provides many modern neural net architectures (including BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet and more) for Natural Language Processing (NLP), including many pretrained models. It relies on either TensorFlow or PyTorch. Best of all: it's amazingly simple to use." + "_Exercise: Use the Transformers library to download a pretrained language model capable of generating text (e.g., GPT), and try generating more convincing Shakespearean text. 
You will need to use the model's `generate()` method—see Hugging Face's documentation for more details._" ] }, { @@ -2776,9 +4085,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 136, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "All model checkpoint layers were used when initializing TFOpenAIGPTLMHeadModel.\n", + "\n", + "All the layers of TFOpenAIGPTLMHeadModel were initialized from the model checkpoint at openai-gpt.\n", + "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFOpenAIGPTLMHeadModel for predictions without further training.\n" + ] + } + ], "source": [ "from transformers import TFOpenAIGPTLMHeadModel\n", "\n", @@ -2794,9 +4114,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 137, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.\n" + ] + } + ], "source": [ "from transformers import OpenAIGPTTokenizer\n", "\n", @@ -2812,9 +4140,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 138, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': [3570, 1473], 'attention_mask': [1, 1]}" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer(\"hello everyone\")" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "prompt_text = \"This royal throne of kings, this sceptred isle\"\n", "encoded_prompt = tokenizer.encode(prompt_text,\n", @@ -2832,9 +4193,50 @@ }, { "cell_type": "code", - "execution_count": null, + 
"execution_count": 140, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "num_sequences = 5\n", "length = 40\n", @@ -2862,9 +4264,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 141, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "this royal throne of kings, this sceptred isle of the necronomicon is the only place that can unlock it from this dark world. \n", + " i am surprised that i've been able to see it, \" the man named dallon says to\n", + "--------------------------------------------------------------------------------\n", + "this royal throne of kings, this sceptred isle was home to many beloved possessors, such as the mighty astaroth. their wives had been husband and wife to lord teixiara for many generations. \n", + " the high king had his own\n", + "--------------------------------------------------------------------------------\n", + "this royal throne of kings, this sceptred isle is now our home and the land of our fathers!'this was made the standard of the coates, which is at king celebrant's command. \n", + " this was the longest story the coates\n", + "--------------------------------------------------------------------------------\n", + "this royal throne of kings, this sceptred isle has a powerful spirit that can not be severed or erased. it will reign until there is no army in our realm or the light will fade from the sky, and the lands will be stripped of its\n", + "--------------------------------------------------------------------------------\n", + "this royal throne of kings, this sceptred isle will be your final gift to king dragomir. \n", + " good luck, my guards. \n", + " * * * \n", + " a light touch on her arm caused aleria to jolt. \" come on. 
i think you\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], "source": [ "for sequence in generated_sequences:\n", " text = tokenizer.decode(sequence, clean_up_tokenization_spaces=True)\n", @@ -2903,7 +4328,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.9.10" }, "nav_menu": {}, "toc": {