diff --git a/13_loading_and_preprocessing_data.ipynb b/13_loading_and_preprocessing_data.ipynb index a3bf230..1d7e4cd 100644 --- a/13_loading_and_preprocessing_data.ipynb +++ b/13_loading_and_preprocessing_data.ipynb @@ -103,14 +103,6 @@ "execution_count": 4, "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-01-19 17:53:50.433275: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" - ] - }, { "data": { "text/plain": [ @@ -3163,27 +3155,27 @@ "cell_type": "code", "execution_count": 115, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-02-20 15:27:32.431462: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], "source": [ - "tf.keras.backend.clear_session()\n", - "np.random.seed(42)\n", - "tf.random.set_seed(42)" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [], - "source": [ - "train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train))\n", + "tf.random.set_seed(42)\n", + "train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train))\n", + "train_set = train_set.shuffle(len(X_train), seed=42)\n", "valid_set = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))\n", "test_set = tf.data.Dataset.from_tensor_slices((X_test, y_test))" ] }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 116, "metadata": {}, 
"outputs": [], "source": [ @@ -3200,7 +3192,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 117, "metadata": {}, "outputs": [ { @@ -3243,7 +3235,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -3264,7 +3256,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -3283,7 +3275,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -3313,7 +3305,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -3324,12 +3316,12 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 122, "metadata": {}, "outputs": [ { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -3351,24 +3343,13 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ - "tf.keras.backend.clear_session()\n", "tf.random.set_seed(42)\n", - "np.random.seed(42)\n", "\n", - "class Standardization(tf.keras.layers.Layer):\n", - " def adapt(self, data_sample):\n", - " self.means_ = np.mean(data_sample, axis=0, keepdims=True)\n", - " self.stds_ = np.std(data_sample, axis=0, keepdims=True)\n", - " def call(self, inputs):\n", - " return (inputs - self.means_) / (self.stds_ + tf.keras.backend.epsilon())\n", - "\n", - "standardization = Standardization(input_shape=[28, 28])\n", - "# or perhaps soon:\n", - "#standardization = tf.keras.layers.Normalization()\n", + "standardization = tf.keras.layers.Normalization(input_shape=[28, 28])\n", "\n", "sample_image_batches = train_set.take(100).map(lambda image, label: image)\n", "sample_images = np.concatenate(list(sample_image_batches.as_numpy_iterator()),\n", @@ -3387,25 +3368,79 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 124, "metadata": {}, "outputs": [ { - "ename": "TypeError", - "evalue": "unsupported operand type(s) for +: 'PosixPath' and 'str'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/wy/h39t6kb11pnbb0pzhksd_fqh0000gq/T/ipykernel_76919/164425769.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mdatetime\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mlogs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPath\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m/\u001b[0m 
\u001b[0;34m\"my_logs\"\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m\"run_\"\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrftime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"%Y%m%d_%H%M%S\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m tensorboard_cb = tf.keras.callbacks.TensorBoard(\n", - "\u001b[0;31mTypeError\u001b[0m: unsupported operand type(s) for +: 'PosixPath' and 'str'" + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-02-20 15:30:49.689831: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.\n", + "2022-02-20 15:30:49.689858: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.\n", + "2022-02-20 15:30:49.691427: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.\n" ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + " 59/Unknown - 1s 3ms/step - loss: 0.9230 - accuracy: 0.6817" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-02-20 15:30:50.428921: I tensorflow/core/profiler/lib/profiler_session.cc:110] Profiler session initializing.\n", + "2022-02-20 15:30:50.428945: I tensorflow/core/profiler/lib/profiler_session.cc:125] Profiler session started.\n", + "2022-02-20 15:30:50.433359: I tensorflow/core/profiler/lib/profiler_session.cc:67] Profiler session collecting data.\n", + "2022-02-20 15:30:50.446608: I tensorflow/core/profiler/lib/profiler_session.cc:143] Profiler session tear down.\n", + "2022-02-20 15:30:50.461272: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50\n", + "\n", + "2022-02-20 15:30:50.465450: I 
tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50/kiwimac.trace.json.gz\n", + "2022-02-20 15:30:50.480245: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50\n", + "\n", + "2022-02-20 15:30:50.480582: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for memory_profile.json.gz to my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50/kiwimac.memory_profile.json.gz\n", + "2022-02-20 15:30:50.482034: I tensorflow/core/profiler/rpc/client/capture_profile.cc:251] Creating directory: my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50\n", + "Dumped tool data for xplane.pb to my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50/kiwimac.xplane.pb\n", + "Dumped tool data for overview_page.pb to my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50/kiwimac.overview_page.pb\n", + "Dumped tool data for input_pipeline.pb to my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50/kiwimac.input_pipeline.pb\n", + "Dumped tool data for tensorflow_stats.pb to my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50/kiwimac.tensorflow_stats.pb\n", + "Dumped tool data for kernel_stats.pb to my_logs/run_/20220220_153049/plugins/profile/2022_02_20_15_30_50/kiwimac.kernel_stats.pb\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1719/1719 [==============================] - 5s 2ms/step - loss: 0.4437 - accuracy: 0.8402 - val_loss: 0.3649 - val_accuracy: 0.8682\n", + "Epoch 2/5\n", + "1719/1719 [==============================] - 4s 2ms/step - loss: 0.3333 - accuracy: 0.8775 - val_loss: 0.3346 - val_accuracy: 0.8790\n", + "Epoch 3/5\n", + "1719/1719 [==============================] - 4s 2ms/step - loss: 0.2970 - accuracy: 0.8905 - val_loss: 0.3235 - 
val_accuracy: 0.8866\n", + "Epoch 4/5\n", + "1719/1719 [==============================] - 4s 2ms/step - loss: 0.2723 - accuracy: 0.8995 - val_loss: 0.3308 - val_accuracy: 0.8888\n", + "Epoch 5/5\n", + "1719/1719 [==============================] - 4s 2ms/step - loss: 0.2534 - accuracy: 0.9047 - val_loss: 0.3174 - val_accuracy: 0.8916\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 124, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "from datetime import datetime\n", "\n", - "logs = Path() / \"my_logs\" / \"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n", + "logs = Path() / \"my_logs\" / datetime.now().strftime(\"run_%Y%m%d_%H%M%S\")\n", "\n", "tensorboard_cb = tf.keras.callbacks.TensorBoard(\n", " log_dir=logs, histogram_freq=1, profile_batch=10)\n", @@ -3416,12 +3451,47 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 125, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The tensorboard extension is already loaded. 
To reload it, use:\n", + " %reload_ext tensorboard\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "%load_ext tensorboard\n", - "%tensorboard --logdir=./my_logs --port=6006" + "%tensorboard --logdir=./my_logs" ] }, { @@ -3437,13 +3507,24 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 126, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/Users/ageron/.keras/datasets/aclImdb')" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from pathlib import Path\n", "\n", - "root = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n", + "root = \"https://ai.stanford.edu/~amaas/data/sentiment/\"\n", "filename = \"aclImdb_v1.tar.gz\"\n", "filepath = tf.keras.utils.get_file(filename, root + filename, extract=True)\n", "path = Path(filepath).with_name(\"aclImdb\")\n", @@ -3459,7 +3540,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -3482,18 +3563,74 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 128, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/ageron/.keras/datasets/aclImdb/\n", + " test/\n", + " neg/\n", + " 0_2.txt\n", + " 10000_4.txt\n", + " 10001_1.txt\n", + " ...\n", + " pos/\n", + " 0_10.txt\n", + " 10000_7.txt\n", + " 10001_9.txt\n", + " ...\n", + " labeledBow.feat\n", + " urls_neg.txt\n", + " urls_pos.txt\n", + " train/\n", + " neg/\n", + " 0_3.txt\n", + " 10000_4.txt\n", + " 10001_4.txt\n", + " ...\n", + " pos/\n", + " 0_9.txt\n", + " 10000_8.txt\n", + " 10001_10.txt\n", + " ...\n", + " unsup/\n", + " 0_0.txt\n", + " 10000_0.txt\n", + " 10001_0.txt\n", + " ...\n", + " labeledBow.feat\n", + " unsupBow.feat\n", + " 
urls_neg.txt\n", + " ...\n", + " README\n", + " imdb.vocab\n", + " imdbEr.txt\n" + ] + } + ], "source": [ "tree(path)" ] }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 129, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(12500, 12500, 12500, 12500)" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def review_paths(dirpath):\n", " return [str(path) for path in dirpath.glob(\"*.txt\")]\n", @@ -3516,7 +3653,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -3545,7 +3682,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -3563,9 +3700,25 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 132, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor(b\"Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.

Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.\", shape=(), dtype=string)\n", + "tf.Tensor(0, shape=(), dtype=int32)\n", + "\n", + "tf.Tensor(b'Well...tremors I, the original started off in 1990 and i found the movie quite enjoyable to watch. however, they proceeded to make tremors II and III. Trust me, those movies started going downhill right after they finished the first one, i mean, ass blasters??? Now, only God himself is capable of answering the question \"why in Gods name would they create another one of these dumpster dives of a movie?\" Tremors IV cannot be considered a bad movie, in fact it cannot be even considered an epitome of a bad movie, for it lives up to more than that. As i attempted to sit though it, i noticed that my eyes started to bleed, and i hoped profusely that the little girl from the ring would crawl through the TV and kill me. did they really think that dressing the people who had stared in the other movies up as though they we\\'re from the wild west would make the movie (with the exact same occurrences) any better? honestly, i would never suggest buying this movie, i mean, there are cheaper ways to find things that burn well.', shape=(), dtype=string)\n", + "tf.Tensor(0, shape=(), dtype=int32)\n", + "\n", + "tf.Tensor(b\"Ouch! This one was a bit painful to sit through. It has a cute and amusing premise, but it all goes to hell from there. Matthew Modine is almost always pedestrian and annoying, and he does not disappoint in this one. Deborah Kara Unger and John Neville turned in surprisingly decent performances. Alan Bates and Jennifer Tilly, among others, played it way over the top. I know that's the way the parts were written, and it's hard to blame actors, when the script and director have them do such schlock. If you're going to have outrageous characters, that's OK, but you gotta have good material to make it work. It didn't here. 
Run away screaming from this movie if at all possible.\", shape=(), dtype=string)\n", + "tf.Tensor(0, shape=(), dtype=int32)\n", + "\n" + ] + } + ], "source": [ "for X, y in imdb_dataset(train_pos, train_neg).take(3):\n", " print(X)\n", @@ -3575,9 +3728,17 @@ }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 133, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "29.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" + ] + } + ], "source": [ "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(10): pass" ] @@ -3598,7 +3759,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -3614,9 +3775,17 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 135, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "27.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)\n" + ] + } + ], "source": [ "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(10): pass" ] @@ -3630,22 +3799,31 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 136, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20.6 s ± 0 ns per loop (mean ± std. dev. 
of 1 run, 1 loop each)\n" + ] + } + ], "source": [ "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).cache().repeat(10): pass" ] }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ "batch_size = 32\n", "\n", - "train_set = imdb_dataset(train_pos, train_neg).shuffle(25000).batch(batch_size).prefetch(1)\n", + "train_set = imdb_dataset(train_pos, train_neg).shuffle(25000, seed=42)\n", + "train_set = train_set.batch(batch_size).prefetch(1)\n", "valid_set = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)\n", "test_set = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)" ] @@ -3655,155 +3833,29 @@ "metadata": {}, "source": [ "### d.\n", - "_Exercise: Create a binary classification model, using a `TextVectorization` layer to preprocess each review. If the `TextVectorization` layer is not yet available (or if you like a challenge), try to create your own custom preprocessing layer: you can use the functions in the `tf.strings` package, for example `lower()` to make everything lowercase, `regex_replace()` to replace punctuation with spaces, and `split()` to split words on spaces. You should use a lookup table to output word indices, which must be prepared in the `adapt()` method._" + "_Exercise: Create a binary classification model, using a `TextVectorization` layer to preprocess each review._" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's first write a function to preprocess the reviews, cropping them to 300 characters, converting them to lower case, then replacing `
` and all non-letter characters to spaces, splitting the reviews into words, and finally padding or cropping each review so it ends up with exactly `n_words` tokens:" + "Let's create a `TextVectorization` layer and adapt it to the full IMDB training set (if the training set did not fit in RAM, we could just use a smaller sample of the training set by calling `train_set.take(500)`). Let's use TF-IDF for now." ] }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ - "def preprocess(X_batch, n_words=50):\n", - " shape = tf.shape(X_batch) * tf.constant([1, 0]) + tf.constant([0, n_words])\n", - " Z = tf.strings.substr(X_batch, 0, 300)\n", - " Z = tf.strings.lower(Z)\n", - " Z = tf.strings.regex_replace(Z, b\"\", b\" \")\n", - " Z = tf.strings.regex_replace(Z, b\"[^a-z]\", b\" \")\n", - " Z = tf.strings.split(Z)\n", - " return Z.to_tensor(shape=shape, default_value=b\"\")\n", - "\n", - "X_example = tf.constant([\"It's a great, great movie! 
I loved it.\", \"It was terrible, run away!!!\"])\n", - "preprocess(X_example)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's write a second utility function that will take a data sample with the same format as the output of the `preprocess()` function, and will output the list of the top `max_size` most frequent words, ensuring that the padding token is first:" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "metadata": {}, - "outputs": [], - "source": [ - "from collections import Counter\n", - "\n", - "def get_vocabulary(data_sample, max_size=1000):\n", - " preprocessed_reviews = preprocess(data_sample).numpy()\n", - " counter = Counter()\n", - " for words in preprocessed_reviews:\n", - " for word in words:\n", - " if word != b\"\":\n", - " counter[word] += 1\n", - " return [b\"\"] + [word for word, count in counter.most_common(max_size)]\n", - "\n", - "get_vocabulary(X_example)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we are ready to create the `TextVectorization` layer. Its constructor just saves the hyperparameters (`max_vocabulary_size` and `n_oov_buckets`). The `adapt()` method computes the vocabulary using the `get_vocabulary()` function, then it builds a `StaticVocabularyTable` (see Chapter 16 for more details). 
The `call()` method preprocesses the reviews to get a padded list of words for each review, then it uses the `StaticVocabularyTable` to lookup the index of each word in the vocabulary:" - ] - }, - { - "cell_type": "code", - "execution_count": 141, - "metadata": {}, - "outputs": [], - "source": [ - "class TextVectorization(tf.keras.layers.Layer):\n", - " def __init__(self, max_vocabulary_size=1000, n_oov_buckets=100, dtype=tf.string, **kwargs):\n", - " super().__init__(dtype=dtype, **kwargs)\n", - " self.max_vocabulary_size = max_vocabulary_size\n", - " self.n_oov_buckets = n_oov_buckets\n", - "\n", - " def adapt(self, data_sample):\n", - " self.vocab = get_vocabulary(data_sample, self.max_vocabulary_size)\n", - " words = tf.constant(self.vocab)\n", - " word_ids = tf.range(len(self.vocab), dtype=tf.int64)\n", - " vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)\n", - " self.table = tf.lookup.StaticVocabularyTable(vocab_init, self.n_oov_buckets)\n", - " \n", - " def call(self, inputs):\n", - " preprocessed_inputs = preprocess(inputs)\n", - " return self.table.lookup(preprocessed_inputs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's try it on our small `X_example` we defined earlier:" - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": {}, - "outputs": [], - "source": [ - "text_vectorization = TextVectorization()\n", - "\n", - "text_vectorization.adapt(X_example)\n", - "text_vectorization(X_example)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looks good! 
As you can see, each review was cleaned up and tokenized, then each word was encoded as its index in the vocabulary (all the 0s correspond to the `` tokens).\n", - "\n", - "Now let's create another `TextVectorization` layer and let's adapt it to the full IMDB training set (if the training set did not fit in RAM, we could just use a smaller sample of the training set by calling `train_set.take(500)`):" - ] - }, - { - "cell_type": "code", - "execution_count": 143, - "metadata": {}, - "outputs": [], - "source": [ - "max_vocabulary_size = 1000\n", - "n_oov_buckets = 100\n", - "\n", - "sample_review_batches = train_set.map(lambda review, label: review)\n", - "sample_reviews = np.concatenate(list(sample_review_batches.as_numpy_iterator()),\n", - " axis=0)\n", - "\n", - "text_vectorization = TextVectorization(max_vocabulary_size, n_oov_buckets,\n", - " input_shape=[])\n", + "max_tokens = 1000\n", + "sample_reviews = train_set.map(lambda review, label: review)\n", + "text_vectorization = tf.keras.layers.TextVectorization(\n", + " max_tokens=max_tokens, output_mode=\"tf_idf\")\n", "text_vectorization.adapt(sample_reviews)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's run it on the same `X_example`, just to make sure the word IDs are larger now, since the vocabulary is bigger:" - ] - }, - { - "cell_type": "code", - "execution_count": 144, - "metadata": {}, - "outputs": [], - "source": [ - "text_vectorization(X_example)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -3813,11 +3865,22 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 139, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i']" + ] + }, + "execution_count": 139, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "text_vectorization.vocab[:10]" + "text_vectorization.get_vocabulary()[:10]" ] }, { @@ -3827,79 +3890,6 @@ "These are 
the most common words in the reviews." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now to build our model we will need to encode all these word IDs somehow. One approach is to create bags of words: for each review, and for each word in the vocabulary, we count the number of occurences of that word in the review. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": {}, - "outputs": [], - "source": [ - "simple_example = tf.constant([[1, 3, 1, 0, 0], [2, 2, 0, 0, 0]])\n", - "tf.reduce_sum(tf.one_hot(simple_example, 4), axis=1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The first review has 2 times the word 0, 2 times the word 1, 0 times the word 2, and 1 time the word 3, so its bag-of-words representation is `[2, 2, 0, 1]`. Similarly, the second review has 3 times the word 0, 0 times the word 1, and so on. Let's wrap this logic in a small custom layer, and let's test it. We'll drop the counts for the word 0, since this corresponds to the `` token, which we don't care about." - ] - }, - { - "cell_type": "code", - "execution_count": 147, - "metadata": {}, - "outputs": [], - "source": [ - "class BagOfWords(tf.keras.layers.Layer):\n", - " def __init__(self, n_tokens, dtype=tf.int32, **kwargs):\n", - " super().__init__(dtype=dtype, **kwargs)\n", - " self.n_tokens = n_tokens\n", - " def call(self, inputs):\n", - " one_hot = tf.one_hot(inputs, self.n_tokens)\n", - " return tf.reduce_sum(one_hot, axis=1)[:, 1:]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's test it:" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": {}, - "outputs": [], - "source": [ - "bag_of_words = BagOfWords(n_tokens=4)\n", - "bag_of_words(simple_example)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It works fine! 
Now let's create another `BagOfWord` with the right vocabulary size for our training set:" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "metadata": {}, - "outputs": [], - "source": [ - "n_tokens = max_vocabulary_size + n_oov_buckets + 1 # add 1 for \n", - "bag_of_words = BagOfWords(n_tokens)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -3909,13 +3899,40 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 140, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "782/782 [==============================] - 4s 4ms/step - loss: 0.4521 - accuracy: 0.8189 - val_loss: 0.3894 - val_accuracy: 0.8419\n", + "Epoch 2/5\n", + "782/782 [==============================] - 4s 4ms/step - loss: 0.3608 - accuracy: 0.8537 - val_loss: 0.7081 - val_accuracy: 0.7643\n", + "Epoch 3/5\n", + "782/782 [==============================] - 4s 4ms/step - loss: 0.3123 - accuracy: 0.8742 - val_loss: 0.3367 - val_accuracy: 0.8569\n", + "Epoch 4/5\n", + "782/782 [==============================] - 4s 4ms/step - loss: 0.2535 - accuracy: 0.8968 - val_loss: 0.5343 - val_accuracy: 0.8040\n", + "Epoch 5/5\n", + "782/782 [==============================] - 4s 4ms/step - loss: 0.1879 - accuracy: 0.9274 - val_loss: 0.3888 - val_accuracy: 0.8439\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 140, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "tf.random.set_seed(42)\n", "model = tf.keras.Sequential([\n", " text_vectorization,\n", - " bag_of_words,\n", " tf.keras.layers.Dense(100, activation=\"relu\"),\n", " tf.keras.layers.Dense(1, activation=\"sigmoid\"),\n", "])\n", @@ -3928,7 +3945,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We get about 73.5% accuracy on the validation set after just the first epoch, but after that the model makes no significant progress. We will do better in Chapter 16. 
For now the point is just to perform efficient preprocessing using `tf.data` and Keras preprocessing layers." + "We get about 84.2% accuracy on the validation set after just the first epoch, but after that the model makes no significant progress. We will do better in Chapter 16. For now the point is just to perform efficient preprocessing using `tf.data` and Keras preprocessing layers." ] }, { @@ -3948,9 +3965,22 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 141, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 141, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "def compute_mean_embedding(inputs):\n", " not_pad = tf.math.count_nonzero(inputs, axis=-1)\n", @@ -3972,9 +4002,20 @@ }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 142, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tf.reduce_mean(another_example[0:1, :2], axis=1) * tf.sqrt(2.)" ] @@ -3988,9 +4029,20 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 143, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "tf.reduce_mean(another_example[1:2, :1], axis=1) * tf.sqrt(1.)" ] @@ -3999,22 +4051,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Perfect. Now we're ready to train our final model. It's the same as before, except we replaced the `BagOfWords` layer with an `Embedding` layer followed by a `Lambda` layer that calls the `compute_mean_embedding` layer:" + "Perfect. Now we're ready to train our final model. 
It's the same as before, except we replaced TF-IDF with ordinal encoding (`output_mode=\"int\"`) followed by an `Embedding` layer, followed by a `Lambda` layer that calls the `compute_mean_embedding` function:" ] }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ "embedding_size = 20\n", + "tf.random.set_seed(42)\n", + "\n", + "text_vectorization = tf.keras.layers.TextVectorization(\n", + " max_tokens=max_tokens, output_mode=\"int\")\n", + "text_vectorization.adapt(sample_reviews)\n", "\n", "model = tf.keras.Sequential([\n", " text_vectorization,\n", - " tf.keras.layers.Embedding(input_dim=n_tokens,\n", - " output_dim=embedding_size,\n", - " mask_zero=True), # <pad> tokens => zero vectors\n", + " tf.keras.layers.Embedding(input_dim=max_tokens,\n", + " output_dim=embedding_size,\n", + " mask_zero=True), # <pad> tokens => zero vectors\n", " tf.keras.layers.Lambda(compute_mean_embedding),\n", " tf.keras.layers.Dense(100, activation=\"relu\"),\n", " tf.keras.layers.Dense(1, activation=\"sigmoid\"),\n", @@ -4031,11 +4088,39 @@ }, { "cell_type": "code", - "execution_count": 155, + "execution_count": 145, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/5\n", + "782/782 [==============================] - 9s 10ms/step - loss: 0.4758 - accuracy: 0.7675 - val_loss: 0.4153 - val_accuracy: 0.8009\n", + "Epoch 2/5\n", + "782/782 [==============================] - 8s 9ms/step - loss: 0.3438 - accuracy: 0.8537 - val_loss: 0.3814 - val_accuracy: 0.8245\n", + "Epoch 3/5\n", + "782/782 [==============================] - 8s 10ms/step - loss: 0.3244 - accuracy: 0.8618 - val_loss: 0.3341 - val_accuracy: 0.8520\n", + "Epoch 4/5\n", + "782/782 [==============================] - 10s 11ms/step - loss: 0.3153 - accuracy: 0.8666 - val_loss: 0.3122 - val_accuracy: 0.8655\n", + "Epoch 5/5\n", + "782/782 [==============================] - 11s 12ms/step - loss: 0.3135 - 
accuracy: 0.8676 - val_loss: 0.3119 - val_accuracy: 0.8625\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n", + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", "model.fit(train_set, epochs=5, validation_data=valid_set)" ] }, @@ -4043,7 +4128,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The model is not better using embeddings (but we will do better in Chapter 16). The pipeline looks fast enough (we optimized it earlier)." + "The model is just marginally better using embeddings (but we will do better in Chapter 16). The pipeline looks fast enough (we optimized it earlier)." ] }, { @@ -4056,7 +4141,7 @@ }, { "cell_type": "code", - "execution_count": 156, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -4068,9 +4153,18 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 147, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor(b\"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. 
I could barely sit through it.\", shape=(), dtype=string)\n", + "tf.Tensor(0, shape=(), dtype=int64)\n" + ] + } + ], "source": [ "for example in train_set.take(1):\n", " print(example[\"text\"])\n", @@ -4083,86 +4177,6 @@ "metadata": {}, "outputs": [], "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# TODO: remove?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that field 4 is interpreted as a string." - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "metadata": {}, - "outputs": [], - "source": [ - "record_defaults=[0, np.nan, tf.constant(np.nan, dtype=tf.float64), \"Hello\", tf.constant([])]\n", - "parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults)\n", - "parsed_fields" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notice that all missing fields are replaced with their default value, when provided:" - ] - }, - { - "cell_type": "code", - "execution_count": 159, - "metadata": {}, - "outputs": [], - "source": [ - "parsed_fields = tf.io.decode_csv(',,,,5', record_defaults)\n", - "parsed_fields" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The 5th field is compulsory (since we provided `tf.constant([])` as the \"default value\"), so we get an exception if we do not provide it:" - ] - }, - { - "cell_type": "code", - "execution_count": 160, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " parsed_fields = tf.io.decode_csv(',,,,', record_defaults)\n", - "except tf.errors.InvalidArgumentError as ex:\n", - " print(ex)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The number of fields should match exactly the number of fields in the `record_defaults`:" - ] - }, - { - "cell_type": "code", - "execution_count": 161, - "metadata": {}, - "outputs": [], - "source": [ - "try:\n", - " parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)\n", - "except 
tf.errors.InvalidArgumentError as ex:\n", - " print(ex)" - ] } ], "metadata": {