From 1c2421fc8892975ad258127f00346507767102e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Sat, 19 Feb 2022 18:07:03 +1300 Subject: [PATCH] Big update of chapter 15 for 3rd edition --- ...essing_sequences_using_rnns_and_cnns.ipynb | 1832 ++++++++++------- 1 file changed, 1065 insertions(+), 767 deletions(-) diff --git a/15_processing_sequences_using_rnns_and_cnns.ipynb b/15_processing_sequences_using_rnns_and_cnns.ipynb index 111ee7b..35f13da 100644 --- a/15_processing_sequences_using_rnns_and_cnns.ipynb +++ b/15_processing_sequences_using_rnns_and_cnns.ipynb @@ -4,14 +4,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Chapter 14 – Processing Sequences Using RNNs and CNNs**" + "**Chapter 15 – Processing Sequences Using RNNs and CNNs**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "_This notebook contains all the sample code and solutions to the exercises in chapter 14._" + "_This notebook contains all the sample code and solutions to the exercises in chapter 15._" ] }, { @@ -20,74 +20,125 @@ "source": [ "\n", " \n", " \n", "
\n", - " \"Open\n", + " \"Open\n", " \n", - " \n", + " \n", "
" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "id": "8IPbJEmZpKzu" + }, "source": [ - "# Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures." + "This project requires Python 3.8 or above:" ] }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "id": "TFSU3FCOpKzu" + }, "outputs": [], "source": [ - "# Python ≥3.8 is required\n", "import sys\n", - "assert sys.version_info >= (3, 8)\n", "\n", - "# Is this notebook running on Colab or Kaggle?\n", - "IS_COLAB = \"google.colab\" in sys.modules\n", - "IS_KAGGLE = \"kaggle_secrets\" in sys.modules\n", + "assert sys.version_info >= (3, 8)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TAlKky09pKzv" + }, + "source": [ + "It also requires Scikit-Learn ≥ 1.0.1:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "YqCwW7cMpKzw" + }, + "outputs": [], + "source": [ + "import sklearn\n", "\n", - "# Common imports\n", - "import numpy as np\n", + "assert sklearn.__version__ >= \"1.0.1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GJtVEqxfpKzw" + }, + "source": [ + "And TensorFlow ≥ 2.6:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "0Piq5se2pKzx" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "assert tf.__version__ >= \"2.6.0\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DDaDoLQTpKzx" + }, + "source": [ + "As we did in earlier chapters, let's define the default font sizes to make the figures prettier:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "8d4TH3NbpKzx" + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.rc('font', size=14)\n", + "plt.rc('axes', labelsize=14, titlesize=14)\n", + "plt.rc('legend', fontsize=14)\n", + "plt.rc('xtick', labelsize=10)\n", + "plt.rc('ytick', labelsize=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RcoUIRsvpKzy" + }, + "source": [ + "And let's create the `images/rnn` folder (if it doesn't already exist), and define the `save_fig()` function which is used through this notebook to save the figures in high-res for the book:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "PQFH5Y9PpKzy" + }, + "outputs": [], + "source": [ "from pathlib import Path\n", "\n", - "# Scikit-Learn ≥1.0 is required\n", - "import sklearn\n", - "assert sklearn.__version__ >= \"1.0\"\n", - "\n", - "# TensorFlow ≥2.6 is required\n", - "import tensorflow as tf\n", - "assert tf.__version__ >= \"2.6\"\n", - "\n", - "# to make this notebook's output stable across runs\n", - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "if not tf.config.list_physical_devices('GPU'):\n", - " print(\"No GPU was detected. 
Neural nets can be very slow without a GPU.\")\n", - " if IS_COLAB:\n", - " print(\"Go to Runtime > Change runtime and select a GPU hardware accelerator.\")\n", - " if IS_KAGGLE:\n", - " print(\"Go to Settings > Accelerator and select GPU.\")\n", - "\n", - "# To plot pretty figures\n", - "%matplotlib inline\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "mpl.rc('axes', labelsize=14)\n", - "mpl.rc('xtick', labelsize=12)\n", - "mpl.rc('ytick', labelsize=12)\n", - "\n", - "# Where to save the figures\n", "IMAGES_PATH = Path() / \"images\" / \"rnn\"\n", "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", "\n", @@ -98,6 +149,32 @@ " plt.savefig(path, format=fig_extension, dpi=resolution)" ] }, + { + "cell_type": "markdown", + "metadata": { + "id": "YTsawKlapKzy" + }, + "source": [ + "This chapter can be very slow without a GPU, so let's make sure there's one, or else issue a warning:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "Ekxzo6pOpKzy" + }, + "outputs": [], + "source": [ + "if not tf.config.list_physical_devices('GPU'):\n", + " print(\"No GPU was detected. Neural nets can be very slow without a GPU.\")\n", + " if \"google.colab\" in sys.modules:\n", + " print(\"Go to Runtime > Change runtime and select a GPU hardware \"\n", + " \"accelerator.\")\n", + " if \"kaggle_secrets\" in sys.modules:\n", + " print(\"Go to Settings > Accelerator and select GPU.\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -109,109 +186,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Generate the Dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def generate_time_series(batch_size, n_steps):\n", - " freq1, freq2, offsets1, offsets2 = np.random.rand(4, batch_size, 1)\n", - " time = np.linspace(0, 1, n_steps)\n", - " series = 0.5 * np.sin((time - offsets1) * (freq1 * 10 + 10)) # wave 1\n", - " series += 0.2 * np.sin((time - offsets2) * (freq2 * 20 + 20)) # + wave 2\n", - " series += 0.1 * (np.random.rand(batch_size, n_steps) - 0.5) # + noise\n", - " return series[..., np.newaxis].astype(np.float32)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "\n", - "n_steps = 50\n", - "series = generate_time_series(10000, n_steps + 1)\n", - "X_train, y_train = series[:7000, :n_steps], series[:7000, -1]\n", - "X_valid, y_valid = series[7000:9000, :n_steps], series[7000:9000, -1]\n", - "X_test, y_test = series[9000:, :n_steps], series[9000:, -1]" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "X_train.shape, y_train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_series(series, y=None, y_pred=None, x_label=\"$t$\", y_label=\"$x(t)$\", legend=True):\n", - " plt.plot(series, \".-\")\n", - " if y is not None:\n", - " plt.plot(n_steps, y, \"bo\", label=\"Target\")\n", - " if y_pred is not None:\n", - " plt.plot(n_steps, y_pred, \"rx\", markersize=10, label=\"Prediction\")\n", - " plt.grid(True)\n", - " if x_label:\n", - " plt.xlabel(x_label, fontsize=16)\n", - " if y_label:\n", - " plt.ylabel(y_label, fontsize=16, rotation=0)\n", - " plt.hlines(0, 0, 100, linewidth=1)\n", - " plt.axis([0, n_steps + 1, -1, 1])\n", - " if legend and (y or y_pred):\n", - " plt.legend(fontsize=14, loc=\"upper left\")\n", - "\n", - "fig, axes = 
plt.subplots(nrows=1, ncols=3, sharey=True, figsize=(12, 4))\n", - "for col in range(3):\n", - " plt.sca(axes[col])\n", - " plot_series(X_valid[col, :, 0], y_valid[col, 0],\n", - " y_label=(\"$x(t)$\" if col==0 else None),\n", - " legend=(col == 0))\n", - "save_fig(\"time_series_plot\")\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: in this notebook, the blue dots represent targets, and red crosses represent predictions. In the book, I first used blue crosses for targets and red dots for predictions, then I reversed this later in the chapter. Sorry if this caused some confusion." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Computing Some Baselines" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Naive predictions (just predict the last observed value):" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = X_valid[:, -1]\n", - "np.mean(tf.keras.losses.mean_squared_error(y_valid, y_pred))" + "Let's load ridership data from Chicago's Transit Authority (available on [Chicago's Data Portal](https://homl.info/ridership)." ] }, { @@ -220,15 +195,15 @@ "metadata": {}, "outputs": [], "source": [ - "plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Linear predictions:" + "import pandas as pd\n", + "from pathlib import Path\n", + "\n", + "path = Path(\"datasets/ridership/CTA_-_Ridership_-_Daily_Boarding_Totals.csv\")\n", + "df = pd.read_csv(path, parse_dates=[\"service_date\"])\n", + "df.columns = [\"date\", \"day_type\", \"bus\", \"rail\", \"total\"] # shorter names\n", + "df = df.sort_values(\"date\").set_index(\"date\")\n", + "df = df.drop(\"total\", axis=1) # no need for total, it's just bus + rail\n", + "df = df.drop_duplicates() # remove duplicated months (2011-10 and 2014-07)" ] }, { @@ -237,17 +212,14 @@ "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.Flatten(input_shape=[50, 1]),\n", - " tf.keras.layers.Dense(1)\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\")\n", - "history = model.fit(X_train, y_train, epochs=20,\n", - " validation_data=(X_valid, y_valid))" + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's look at the first few months of 2019 (note that Pandas treats the range boundaries as inclusive):" ] }, { @@ -256,7 +228,11 @@ "metadata": {}, "outputs": [], "source": [ - "model.evaluate(X_valid, y_valid)" + "import matplotlib.pyplot as plt\n", + "\n", + "df[\"2019-03\":\"2019-05\"].plot(grid=True, marker=\".\", figsize=(8, 3.5))\n", + "save_fig(\"daily_ridership_plot\") # extra code – saves the figure for the book\n", + "plt.show()" ] }, { @@ -265,31 +241,339 @@ "metadata": {}, "outputs": [], "source": [ - "def plot_learning_curves(loss, val_loss):\n", - " plt.plot(np.arange(len(loss)) + 0.5, loss, \"b.-\", label=\"Training loss\")\n", - " plt.plot(np.arange(len(val_loss)) + 1, val_loss, \"r.-\", label=\"Validation loss\")\n", - " plt.gca().xaxis.set_major_locator(mpl.ticker.MaxNLocator(integer=True))\n", - " plt.axis([1, 20, 0, 0.05])\n", - " plt.legend(fontsize=14)\n", - " plt.xlabel(\"Epochs\")\n", - " plt.ylabel(\"Loss\")\n", - " plt.grid(True)\n", + "diff_7 = df[[\"bus\", 
\"rail\"]].diff(7)[\"2019-03\":\"2019-05\"]\n", "\n", - "plot_learning_curves(history.history[\"loss\"], history.history[\"val_loss\"])\n", + "fig, axs = plt.subplots(2, 1, sharex=True, figsize=(8, 5))\n", + "df.plot(ax=axs[0], legend=False, marker=\".\") # original time series\n", + "df.shift(7).plot(ax=axs[0], grid=True, legend=False, linestyle=\":\") # lagged\n", + "diff_7.plot(ax=axs[1], grid=True, marker=\".\") # 7-day difference time series\n", + "axs[0].set_ylim([170_000, 900_000]) # extra code – beautifies the plot\n", + "save_fig(\"differencing_plot\") # extra code – saves the figure for the book\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 11, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "list(df.loc[\"2019-05-25\":\"2019-05-27\"][\"day_type\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mean absolute error (MAE), also called mean absolute deviation (MAD):" + ] + }, + { + "cell_type": "code", + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "y_pred = model.predict(X_valid)\n", - "plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])\n", + "diff_7.abs().mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Mean absolute percentage error (MAPE):" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "targets = df[[\"bus\", \"rail\"]][\"2019-03\":\"2019-05\"]\n", + "(diff_7 / targets).abs().mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's look at the yearly seasonality and the long-term trends:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "period = slice(\"2001\", \"2019\")\n", + "df_monthly = df.resample('M').mean() # compute the mean for each month\n", + "rolling_average_12_months = df_monthly[period].rolling(window=12).mean()\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 4))\n", + "df_monthly[period].plot(ax=ax, marker=\".\")\n", + "rolling_average_12_months.plot(ax=ax, grid=True, legend=False)\n", + "save_fig(\"long_term_ridership_plot\") # extra code – saves the figure for the book\n", "plt.show()" ] }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "df_monthly.diff(12)[period].plot(grid=True, marker=\".\", figsize=(8, 3))\n", + "save_fig(\"yearly_diff_plot\") # extra code – saves the figure for the book\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "from statsmodels.tsa.arima.model import ARIMA\n", + "\n", + "origin, today = \"2019-01-01\", \"2019-05-31\"\n", + "rail_series = df.loc[origin:today][\"rail\"].asfreq(\"D\")\n", + "model = ARIMA(rail_series,\n", + " order=(1, 0, 0),\n", + " seasonal_order=(0, 1, 1, 7))\n", + "model = model.fit()\n", + "y_pred = model.forecast() # returns 427,758.6" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred[0] # ARIMA forecast" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"rail\"].loc[\"2019-06-01\"] # target value" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"rail\"].loc[\"2019-05-25\"] # naive forecast (value from one week earlier)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, 
+ "metadata": {}, + "outputs": [], + "source": [ + "origin, start_date, end_date = \"2019-01-01\", \"2019-03-01\", \"2019-05-31\"\n", + "time_period = pd.date_range(start_date, end_date)\n", + "rail_series = df.loc[origin:end_date][\"rail\"].asfreq(\"D\")\n", + "y_preds = []\n", + "for today in time_period.shift(-1):\n", + " model = ARIMA(rail_series[origin:today], # train on data up to \"today\"\n", + " order=(1, 0, 0),\n", + " seasonal_order=(0, 1, 1, 7))\n", + " model = model.fit() # note that we retrain the model every day!\n", + " y_pred = model.forecast()[0]\n", + " y_preds.append(y_pred)\n", + "\n", + "y_preds = pd.Series(y_preds, index=time_period)\n", + "mae = (y_preds - rail_series[time_period]).abs().mean() # returns 32,040.7" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "mae" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – displays the SARIMA forecasts\n", + "fig, ax = plt.subplots(figsize=(8, 3))\n", + "rail_series.loc[time_period].plot(label=\"True\", ax=ax, marker=\".\", grid=True)\n", + "ax.plot(y_preds, color=\"r\", marker=\".\", label=\"SARIMA Forecasts\")\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – shows how to plot the Autocorrelation Function (ACF) and the\n", + "# Partial Autocorrelation Function (PACF)\n", + "\n", + "from statsmodels.graphics.tsaplots import plot_acf, plot_pacf\n", + "\n", + "fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))\n", + "plot_acf(df[period][\"rail\"], ax=axs[0], lags=35)\n", + "axs[0].grid()\n", + "plot_pacf(df[period][\"rail\"], ax=axs[1], lags=35, method=\"ywm\")\n", + "axs[1].grid()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "my_series = [0, 1, 2, 3, 4, 5]\n", + "my_dataset = tf.keras.utils.timeseries_dataset_from_array(\n", + " my_series,\n", + " targets=my_series[3:], # the targets are 3 steps into the future\n", + " sequence_length=3,\n", + " batch_size=2\n", + ")\n", + "list(my_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "for window_dataset in tf.data.Dataset.range(6).window(4, shift=1):\n", + " for element in window_dataset:\n", + " print(f\"{element}\", end=\" \")\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = tf.data.Dataset.range(6).window(4, shift=1, drop_remainder=True)\n", + "dataset = dataset.flat_map(lambda window_dataset: window_dataset.batch(4))\n", + "for window_tensor in dataset:\n", + " print(f\"{window_tensor}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "def to_windows(dataset, length):\n", + " dataset = dataset.window(length, shift=1, drop_remainder=True)\n", + " return dataset.flat_map(lambda window_ds: window_ds.batch(length))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = to_windows(tf.data.Dataset.range(6), 4)\n", + "dataset = dataset.map(lambda window: (window[:-1], window[-1]))\n", + "list(dataset.batch(2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before we 
continue looking at the data, let's split the time series into three periods, for training, validation and testing. We won't look at the test data for now:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "rail_train = df[\"rail\"][\"2016-01\":\"2018-12\"] / 1e6\n",
    "rail_valid = df[\"rail\"][\"2019-01\":\"2019-05\"] / 1e6\n",
    "rail_test = df[\"rail\"][\"2019-06\":] / 1e6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "seq_length = 56\n",
    "tf.random.set_seed(42) # extra code – ensures reproducibility\n",
    "train_ds = tf.keras.utils.timeseries_dataset_from_array(\n",
    "    rail_train.to_numpy(),\n",
    "    targets=rail_train[seq_length:],\n",
    "    sequence_length=seq_length,\n",
    "    batch_size=32,\n",
    "    shuffle=True,\n",
    "    seed=42\n",
    ")\n",
    "valid_ds = tf.keras.utils.timeseries_dataset_from_array(\n",
    "    rail_valid.to_numpy(),\n",
    "    targets=rail_valid[seq_length:],\n",
    "    sequence_length=seq_length,\n",
    "    batch_size=32\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.random.set_seed(42)\n",
    "model = tf.keras.Sequential([\n",
    "    tf.keras.layers.Dense(1, input_shape=[seq_length])\n",
    "])\n",
    "early_stopping_cb = tf.keras.callbacks.EarlyStopping(\n",
    "    monitor=\"val_mae\", patience=50, restore_best_weights=True)\n",
    "opt = tf.keras.optimizers.SGD(learning_rate=0.02, momentum=0.9)\n",
    "model.compile(loss=tf.keras.losses.Huber(), optimizer=opt, metrics=[\"mae\"])\n",
    "history = model.fit(train_ds, validation_data=valid_ds, epochs=500,\n",
    "                    callbacks=[early_stopping_cb])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# extra code – evaluates the model\n",
    "valid_loss, valid_mae = model.evaluate(valid_ds)\n",
    "valid_mae * 1e6"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
@@ -299,51 +583,65 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
-    "np.random.seed(42)\n",
-    "tf.random.set_seed(42)\n",
-    "\n",
+    "tf.random.set_seed(42) # extra code – ensures reproducibility\n",
    "model = tf.keras.Sequential([\n",
    "    tf.keras.layers.SimpleRNN(1, input_shape=[None, 1])\n",
-    "])\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# extra code – defines a utility function we'll reuse several times\n",
    "\n",
-    "optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)\n",
-    "model.compile(loss=\"mse\", optimizer=optimizer)\n",
-    "history = model.fit(X_train, y_train, epochs=20,\n",
-    "                    validation_data=(X_valid, y_valid))"
+    "def fit_and_evaluate(model, train_set, valid_set, learning_rate, epochs=500):\n",
+    "    early_stopping_cb = tf.keras.callbacks.EarlyStopping(\n",
+    "        monitor=\"val_mae\", patience=50, restore_best_weights=True)\n",
+    "    opt = tf.keras.optimizers.SGD(learning_rate=learning_rate, momentum=0.9)\n",
+    "    model.compile(loss=tf.keras.losses.Huber(), optimizer=opt, metrics=[\"mae\"])\n",
+    "    history = model.fit(train_set, validation_data=valid_set, epochs=epochs,\n",
+    "                        callbacks=[early_stopping_cb])\n",
+    "    valid_loss, valid_mae = model.evaluate(valid_set)\n",
+    "    return valid_mae * 1e6"
   ]
 },
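+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next cell is an extra illustration (not from the book): it peeks at one batch from `train_ds` to check the shapes that `tf.keras.utils.timeseries_dataset_from_array()` produces – up to 32 windows of 56 days each, with one target (the following day) per window:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# extra sketch – inspect one batch of windows and their targets\n",
+    "for X_batch, y_batch in train_ds.take(1):\n",
+    "    print(X_batch.shape) # (32, 56): 32 windows of 56 consecutive days\n",
+    "    print(y_batch.shape) # (32,): the target day following each window"
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
-    "model.evaluate(X_valid, y_valid)"
+    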
"fit_and_evaluate(model, train_ds, valid_ds, learning_rate=0.02)" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ - "plot_learning_curves(history.history[\"loss\"], history.history[\"val_loss\"])\n", - "plt.show()" + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "univar_model = tf.keras.Sequential([\n", + " tf.keras.layers.SimpleRNN(32, input_shape=[None, 1]),\n", + " tf.keras.layers.Dense(1) # no activation function by default\n", + "])" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "y_pred = model.predict(X_valid)\n", - "plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])\n", - "plt.show()" + "# extra code – compiles, fits, and evaluates the model, like earlier\n", + "fit_and_evaluate(univar_model, train_ds, valid_ds, learning_rate=0.05)" ] }, { @@ -355,109 +653,166 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True),\n", - " tf.keras.layers.SimpleRNN(1)\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\")\n", - "history = model.fit(X_train, y_train, epochs=20,\n", - " validation_data=(X_valid, y_valid))" + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "deep_model = tf.keras.Sequential([\n", + " tf.keras.layers.SimpleRNN(32, return_sequences=True, input_shape=[None, 1]),\n", + " tf.keras.layers.SimpleRNN(32, return_sequences=True),\n", + " tf.keras.layers.SimpleRNN(32),\n", + " tf.keras.layers.Dense(1)\n", + "])" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ - "model.evaluate(X_valid, y_valid)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "plot_learning_curves(history.history[\"loss\"], history.history[\"val_loss\"])\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = model.predict(X_valid)\n", - "plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])\n", - "plt.show()" + "# extra code – compiles, fits, and evaluates the model, like earlier\n", + "fit_and_evaluate(deep_model, train_ds, valid_ds, learning_rate=0.01)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Make the second `SimpleRNN` layer return only the last output:" + "## Multivariate time series" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", + "df_mulvar = df[[\"bus\", \"rail\"]] / 1e6 # use both bus & rail series as input\n", + "df_mulvar[\"next_day_type\"] = df[\"day_type\"].shift(-1) # we know tomorrow's type\n", + "df_mulvar = pd.get_dummies(df_mulvar) # one-hot encode the day type" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "mulvar_train = df_mulvar[\"2016-01\":\"2018-12\"]\n", + "mulvar_valid = df_mulvar[\"2019-01\":\"2019-05\"]\n", + "mulvar_test = df_mulvar[\"2019-06\":]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + 
"tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "\n", + "train_mulvar_ds = tf.keras.utils.timeseries_dataset_from_array(\n", + " mulvar_train.to_numpy(), # use all 5 columns as input\n", + " targets=mulvar_train[\"rail\"][seq_length:], # forecast only the rail series\n", + " sequence_length=seq_length,\n", + " batch_size=32,\n", + " shuffle=True,\n", + " seed=42\n", + ")\n", + "valid_mulvar_ds = tf.keras.utils.timeseries_dataset_from_array(\n", + " mulvar_valid.to_numpy(),\n", + " targets=mulvar_valid[\"rail\"][seq_length:],\n", + " sequence_length=seq_length,\n", + " batch_size=32\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "mulvar_model = tf.keras.Sequential([\n", + " tf.keras.layers.SimpleRNN(32, input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(1)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – compiles, fits, and evaluates the model, like earlier\n", + "fit_and_evaluate(mulvar_model, train_mulvar_ds, valid_mulvar_ds,\n", + " learning_rate=0.05)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – build and train a multitask RNN that forecasts both bus and rail\n", + "\n", "tf.random.set_seed(42)\n", "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),\n", - " tf.keras.layers.SimpleRNN(20),\n", - " tf.keras.layers.Dense(1)\n", + "seq_length = 56\n", + "train_multask_ds = tf.keras.utils.timeseries_dataset_from_array(\n", + " mulvar_train.to_numpy(),\n", + " targets=mulvar_train[[\"bus\", \"rail\"]][seq_length:], # 2 targets per day\n", + " sequence_length=seq_length,\n", + " batch_size=32,\n", + " shuffle=True,\n", + " seed=42\n", + ")\n", + "valid_multask_ds = tf.keras.utils.timeseries_dataset_from_array(\n", + " mulvar_valid.to_numpy(),\n", + " targets=mulvar_valid[[\"bus\", \"rail\"]][seq_length:],\n", + " sequence_length=seq_length,\n", + " batch_size=32\n", + ")\n", + "\n", + "tf.random.set_seed(42)\n", + "multask_model = tf.keras.Sequential([\n", + " tf.keras.layers.SimpleRNN(32, input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(2)\n", "])\n", "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\")\n", - "history = model.fit(X_train, y_train, epochs=20,\n", - " validation_data=(X_valid, y_valid))" + "fit_and_evaluate(multask_model, train_multask_ds, valid_multask_ds,\n", + " learning_rate=0.02)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ - "model.evaluate(X_valid, y_valid)" + "# extra code – evaluates the naive forecasts for bus\n", + "bus_naive = mulvar_valid[\"bus\"].shift(7)[seq_length:]\n", + "bus_target = mulvar_valid[\"bus\"][seq_length:]\n", + "(bus_target - bus_naive).abs().mean() * 1e6" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ - "plot_learning_curves(history.history[\"loss\"], history.history[\"val_loss\"])\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = model.predict(X_valid)\n", - "plot_series(X_valid[0, :, 0], y_valid[0, 0], y_pred[0, 0])\n", - "plt.show()" + "# extra code – evaluates the multitask RNN's forecasts 
both bus and rail\n", + "Y_preds_valid = multask_model.predict(valid_multask_ds)\n", + "for idx, name in enumerate([\"bus\", \"rail\"]):\n", + " mae = 1e6 * tf.keras.metrics.mean_absolute_error(\n", + " mulvar_valid[name][seq_length:], Y_preds_valid[:, idx])\n", + " print(name, int(mae))" ] }, { @@ -469,47 +824,41 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, + "execution_count": 48, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ - "np.random.seed(43) # not 42, as it would give the first series in the train set\n", + "import numpy as np\n", "\n", - "series = generate_time_series(1, n_steps + 10)\n", - "X_new, Y_new = series[:, :n_steps], series[:, n_steps:]\n", - "X = X_new\n", - "for step_ahead in range(10):\n", - " y_pred_one = model.predict(X[:, step_ahead:])[:, np.newaxis, :]\n", - " X = np.concatenate([X, y_pred_one], axis=1)\n", - "\n", - "Y_pred = X[:, n_steps:]" + "X = rail_valid.to_numpy()[np.newaxis, :seq_length, np.newaxis]\n", + "for step_ahead in range(14):\n", + " y_pred_one = univar_model.predict(X)\n", + " X = np.concatenate([X, y_pred_one.reshape(1, 1, 1)], axis=1)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ - "Y_pred.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_multiple_forecasts(X, Y, Y_pred):\n", - " n_steps = X.shape[1]\n", - " ahead = Y.shape[1]\n", - " plot_series(X[0, :, 0])\n", - " plt.plot(np.arange(n_steps, n_steps + ahead), Y[0, :, 0], \"bo-\", label=\"Actual\")\n", - " plt.plot(np.arange(n_steps, n_steps + ahead), Y_pred[0, :, 0], \"rx-\", label=\"Forecast\", markersize=10)\n", - " plt.axis([0, n_steps + ahead, -1, 1])\n", - " plt.legend(fontsize=14)\n", + "# extra code – generates and saves Figure 15–11\n", "\n", - "plot_multiple_forecasts(X_new, Y_new, Y_pred)\n", + "# The forecasts start on 2019-02-26, as it is the 57th day of 2019, and they end\n", + "# on 2019-03-11. That's 14 days in total.\n", + "Y_pred = pd.Series(X[0, -14:, 0],\n", + " index=pd.date_range(\"2019-02-26\", \"2019-03-11\"))\n", + "\n", + "fig, ax = plt.subplots(figsize=(8, 3.5))\n", + "(rail_valid * 1e6)[\"2019-02-01\":\"2019-03-11\"].plot(\n", + " label=\"True\", marker=\".\", ax=ax)\n", + "(Y_pred * 1e6).plot(\n", + " label=\"Predictions\", grid=True, marker=\"x\", color=\"r\", ax=ax)\n", + "ax.vlines(\"2019-02-25\", 0, 1e6, color=\"k\", linestyle=\"--\", label=\"Today\")\n", + "ax.set_ylim([200_000, 800_000])\n", + "plt.legend(loc=\"center left\")\n", "save_fig(\"forecast_ahead_plot\")\n", "plt.show()" ] @@ -518,259 +867,192 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's use this model to predict the next 10 values. We first need to regenerate the sequences with 9 more time steps." 
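+    "As an extra aside (not from the book), note how the shapes evolve in the forecasting loop a few cells above: `X` starts with shape [1, 56, 1] (one window of 56 days, one feature), each call to `univar_model.predict(X)` returns shape [1, 1], and `reshape(1, 1, 1)` makes that prediction concatenable along the time axis, so after 14 iterations `X` has shape [1, 70, 1].\n",
+    "\n",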
+ "Now let's create an RNN that predicts all 14 next values at once:" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", "\n", - "n_steps = 50\n", - "series = generate_time_series(10000, n_steps + 10)\n", - "X_train, Y_train = series[:7000, :n_steps], series[:7000, -10:, 0]\n", - "X_valid, Y_valid = series[7000:9000, :n_steps], series[7000:9000, -10:, 0]\n", - "X_test, Y_test = series[9000:, :n_steps], series[9000:, -10:, 0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's predict the next 10 values one by one:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "X = X_valid\n", - "for step_ahead in range(10):\n", - " y_pred_one = model.predict(X)[:, np.newaxis, :]\n", - " X = np.concatenate([X, y_pred_one], axis=1)\n", + "def split_inputs_and_targets(mulvar_series, ahead=14, target_col=1):\n", + " return mulvar_series[:, :-ahead], mulvar_series[:, -ahead:, target_col]\n", "\n", - "Y_pred = X[:, n_steps:, 0]" + "ahead_train_ds = tf.keras.utils.timeseries_dataset_from_array(\n", + " mulvar_train.to_numpy(),\n", + " targets=None,\n", + " sequence_length=seq_length + 14,\n", + " batch_size=32,\n", + " shuffle=True,\n", + " seed=42\n", + ").map(split_inputs_and_targets)\n", + "ahead_valid_ds = tf.keras.utils.timeseries_dataset_from_array(\n", + " mulvar_valid.to_numpy(),\n", + " targets=None,\n", + " sequence_length=seq_length + 14,\n", + " batch_size=32\n", + ").map(split_inputs_and_targets)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "Y_pred.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "np.mean(tf.keras.metrics.mean_squared_error(Y_valid, Y_pred))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's compare this performance with some baselines: naive predictions and a simple linear model:" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "Y_naive_pred = np.tile(X_valid[:, -1], 10) # take the last time step value, and repeat it 10 times\n", - "np.mean(tf.keras.metrics.mean_squared_error(Y_valid, Y_naive_pred))" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", "tf.random.set_seed(42)\n", "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.Flatten(input_shape=[50, 1]),\n", - " tf.keras.layers.Dense(10)\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\")\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" + "ahead_model = tf.keras.Sequential([\n", + " tf.keras.layers.SimpleRNN(32, input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(14)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "# extra code – compiles, fits, and evaluates the model, like earlier\n", + "fit_and_evaluate(ahead_model, ahead_train_ds, ahead_valid_ds,\n", + " learning_rate=0.02)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "X = mulvar_valid.to_numpy()[np.newaxis, :seq_length] # shape [1, 56, 5]\n", + "Y_pred = 
ahead_model.predict(X) # shape [1, 14]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's create an RNN that predicts all 10 next values at once:" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),\n", - " tf.keras.layers.SimpleRNN(20),\n", - " tf.keras.layers.Dense(10)\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\")\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(43)\n", - "\n", - "series = generate_time_series(1, 50 + 10)\n", - "X_new, Y_new = series[:, :50, :], series[:, -10:, :]\n", - "Y_pred = model.predict(X_new)[..., np.newaxis]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "plot_multiple_forecasts(X_new, Y_new, Y_pred)\n", - "plt.show()" + "Now let's create an RNN that predicts the next 14 steps at each time step. That is, instead of just forecasting time steps 56 to 69 based on time steps 0 to 55, it will forecast time steps 1 to 14 at time step 0, then time steps 2 to 15 at time step 1, and so on, and finally it will forecast time steps 56 to 69 at the last time step. Notice that the model is causal: when it makes predictions at any time step, it can only see past time steps." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's create an RNN that predicts the next 10 steps at each time step. That is, instead of just forecasting time steps 50 to 59 based on time steps 0 to 49, it will forecast time steps 1 to 10 at time step 0, then time steps 2 to 11 at time step 1, and so on, and finally it will forecast time steps 50 to 59 at the last time step. Notice that the model is causal: when it makes predictions at any time step, it can only see past time steps." 
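+    "As a small extra illustration (not from the book), here is the target layout this implies for one training window, using `seq_length=3` and `ahead=2` on the toy series 0, 1, 2, 3, 4:\n",
+    "\n",
+    "```\n",
+    "inputs:  [0, 1, 2]\n",
+    "targets: [[1, 2],    <- forecasts made at time step 0\n",
+    "          [2, 3],    <- forecasts made at time step 1\n",
+    "          [3, 4]]    <- forecasts made at time step 2\n",
+    "```\n",
+    "\n",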
+ "To prepare the datasets, we can use `to_windows()` twice, to get sequences of consecutive windows, like this:" ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", - "\n", - "n_steps = 50\n", - "series = generate_time_series(10000, n_steps + 10)\n", - "X_train = series[:7000, :n_steps]\n", - "X_valid = series[7000:9000, :n_steps]\n", - "X_test = series[9000:, :n_steps]\n", - "Y = np.empty((10000, n_steps, 10))\n", - "for step_ahead in range(1, 10 + 1):\n", - " Y[..., step_ahead - 1] = series[..., step_ahead:step_ahead + n_steps, 0]\n", - "Y_train = Y[:7000]\n", - "Y_valid = Y[7000:9000]\n", - "Y_test = Y[9000:]" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "X_train.shape, Y_train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(10))\n", - "])\n", - "\n", - "def last_time_step_mse(Y_true, Y_pred):\n", - " return tf.keras.metrics.mean_squared_error(Y_true[:, -1], Y_pred[:, -1])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(43)\n", - "\n", - "series = generate_time_series(1, 50 + 10)\n", - "X_new, Y_new = series[:, :50, :], series[:, 50:, :]\n", - "Y_pred = model.predict(X_new)[:, -1][..., np.newaxis]" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [], - "source": [ - "plot_multiple_forecasts(X_new, Y_new, Y_pred)\n", - "plt.show()" + "my_series = tf.data.Dataset.range(7)\n", + "dataset = to_windows(to_windows(my_series, 3), 4)\n", + "list(dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Deep RNN with Batch Norm" + "Then we can split these elements into the desired inputs and targets:" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True, input_shape=[None, 1]),\n", - " tf.keras.layers.BatchNormalization(),\n", - " tf.keras.layers.SimpleRNN(20, return_sequences=True),\n", - " tf.keras.layers.BatchNormalization(),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(10))\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" + "dataset = dataset.map(lambda S: (S[:, 0], S[:, 1:]))\n", + "list(dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's wrap this idea into a utility function. 
It will also take care of shuffling (optional) and batching:" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "def to_seq2seq_dataset(series, seq_length=56, ahead=14, target_col=1,\n", + " batch_size=32, shuffle=False, seed=None):\n", + " ds = to_windows(tf.data.Dataset.from_tensor_slices(series), ahead + 1)\n", + " ds = to_windows(ds, seq_length).map(lambda S: (S[:, 0], S[:, 1:, 1]))\n", + " if shuffle:\n", + " ds = ds.shuffle(8 * batch_size, seed=seed)\n", + " return ds.batch(batch_size)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "seq2seq_train = to_seq2seq_dataset(mulvar_train, shuffle=True, seed=42)\n", + "seq2seq_valid = to_seq2seq_dataset(mulvar_valid)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "seq2seq_model = tf.keras.Sequential([\n", + " tf.keras.layers.SimpleRNN(32, return_sequences=True, input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(14)\n", + " # equivalent: tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(14))\n", + " # also equivalent: tf.keras.layers.Conv1D(14, kernel_size=1)\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "fit_and_evaluate(seq2seq_model, seq2seq_train, seq2seq_valid,\n", + " learning_rate=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "X = mulvar_valid.to_numpy()[np.newaxis, :seq_length]\n", + "y_pred_14 = seq2seq_model.predict(X)[0, -1] # only the last time step's output" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "Y_pred_valid = seq2seq_model.predict(seq2seq_valid)\n", + "for ahead in range(14):\n", + " preds = pd.Series(Y_pred_valid[:-1, -1, ahead],\n", + " index=mulvar_valid.index[56 + ahead : -14 + ahead])\n", + " mae = (preds - mulvar_valid[\"rail\"]).abs().mean() * 1e6\n", + " print(f\"MAE for +{ahead + 1}: {mae:,.0f}\")" ] }, { @@ -782,16 +1064,7 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "from tensorflow.keras.layers import LayerNormalization" - ] - }, - { - "cell_type": "code", - "execution_count": 43, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -801,14 +1074,10 @@ " self.state_size = units\n", " self.output_size = units\n", " self.simple_rnn_cell = tf.keras.layers.SimpleRNNCell(units,\n", - " activation=None)\n", - " self.layer_norm = LayerNormalization()\n", + " activation=None)\n", + " self.layer_norm = tf.keras.layers.LayerNormalization()\n", " self.activation = tf.keras.activations.get(activation)\n", - " def get_initial_state(self, inputs=None, batch_size=None, dtype=None):\n", - " if inputs is not None:\n", - " batch_size = tf.shape(inputs)[0]\n", - " dtype = inputs.dtype\n", - " return [tf.zeros([batch_size, self.state_size], dtype=dtype)]\n", + "\n", " def call(self, inputs, states):\n", " outputs, new_states = self.simple_rnn_cell(inputs, states)\n", " norm_outputs = self.activation(self.layer_norm(outputs))\n", @@ -817,35 +1086,52 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " 
tf.keras.layers.RNN(LNSimpleRNNCell(20), return_sequences=True,\n", - " input_shape=[None, 1]),\n", - " tf.keras.layers.RNN(LNSimpleRNNCell(20), return_sequences=True),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(10))\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "custom_ln_model = tf.keras.Sequential([\n", + " tf.keras.layers.RNN(LNSimpleRNNCell(32), return_sequences=True,\n", + " input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(14)\n", + "])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Creating a Custom RNN Class" + "Just training for 5 epochs to show that it works (you can increase this if you want):" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "fit_and_evaluate(custom_ln_model, seq2seq_train, seq2seq_valid,\n", + " learning_rate=0.1, epochs=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extra Material – Creating a Custom RNN Class" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The RNN class is not magical. In fact, it's not too hard to implement your own RNN class:" + ] + }, + { + "cell_type": "code", + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -854,11 +1140,16 @@ " super().__init__(**kwargs)\n", " self.cell = cell\n", " self.return_sequences = return_sequences\n", - " self.get_initial_state = getattr(\n", - " self.cell, \"get_initial_state\", self.fallback_initial_state)\n", - " def fallback_initial_state(self, inputs):\n", - " batch_size = tf.shape(inputs)[0]\n", - " return [tf.zeros([batch_size, self.cell.state_size], dtype=inputs.dtype)]\n", + "\n", + " def get_initial_state(self, inputs):\n", + " try:\n", + " return self.cell.get_initial_state(inputs)\n", + " except AttributeError:\n", + " # fallback to zeros if self.cell has no get_initial_state() method\n", + " batch_size = tf.shape(inputs)[0]\n", + " return [tf.zeros([batch_size, self.cell.state_size],\n", + " dtype=inputs.dtype)]\n", + "\n", " @tf.function\n", " def call(self, inputs):\n", " states = self.get_initial_state(inputs)\n", @@ -867,36 +1158,58 @@ " n_steps = shape[1]\n", " sequences = tf.TensorArray(\n", " inputs.dtype, size=(n_steps if self.return_sequences else 0))\n", - " outputs = tf.zeros(shape=[batch_size, self.cell.output_size], dtype=inputs.dtype)\n", + " outputs = tf.zeros(shape=[batch_size, self.cell.output_size],\n", + " dtype=inputs.dtype)\n", " for step in tf.range(n_steps):\n", " outputs, states = self.cell(inputs[:, step], states)\n", " if self.return_sequences:\n", " sequences = sequences.write(step, outputs)\n", + "\n", " if self.return_sequences:\n", + " # stack the outputs into an array of shape\n", + " # [time steps, batch size, dims], then transpose it to shape\n", + " # [batch size, time steps, dims]\n", " return tf.transpose(sequences.stack(), [1, 0, 2])\n", " else:\n", " return outputs" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that `@tf.function` requires the `outputs` variable to be created before the `for` loop, which is why we initialize its value to a zero tensor, even though we don't use that value at all. 
Once the function is converted to a graph, this unused value will be pruned from the graph, so it doesn't impact performance. Similarly, `@tf.function` requires the `sequences` variable to be created before the `if` statement where it is used, even if `self.return_sequences` is `False`, so we create a `TensorArray` of size 0 in this case." + ] + }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", "tf.random.set_seed(42)\n", "\n", - "model = tf.keras.Sequential([\n", - " MyRNN(LNSimpleRNNCell(20), return_sequences=True,\n", - " input_shape=[None, 1]),\n", - " MyRNN(LNSimpleRNNCell(20), return_sequences=True),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(10))\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" + "custom_model = tf.keras.Sequential([\n", + " MyRNN(LNSimpleRNNCell(32), return_sequences=True, input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(14)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just training for 5 epochs to show that it works (you can increase this if you want):" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "fit_and_evaluate(custom_model, seq2seq_train, seq2seq_valid,\n", + " learning_rate=0.1, epochs=5)" ] }, { @@ -908,68 +1221,34 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 68, "metadata": { "scrolled": true }, "outputs": [], "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.LSTM(20, return_sequences=True, input_shape=[None, 1]),\n", - " tf.keras.layers.LSTM(20, return_sequences=True),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(10))\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "lstm_model = tf.keras.models.Sequential([\n", + " tf.keras.layers.LSTM(32, return_sequences=True, input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(14)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just training for 5 epochs to show that it works (you can increase this if you want):" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ - "model.evaluate(X_valid, Y_valid)" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [], - "source": [ - "plot_learning_curves(history.history[\"loss\"], history.history[\"val_loss\"])\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(43)\n", - "\n", - "series = generate_time_series(1, 50 + 10)\n", - "X_new, Y_new = series[:, :50, :], series[:, 50:, :]\n", - "Y_pred = model.predict(X_new)[:, -1][..., np.newaxis]" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "plot_multiple_forecasts(X_new, Y_new, Y_pred)\n", - "plt.show()" + "fit_and_evaluate(lstm_model, seq2seq_train, seq2seq_valid,\n", + " 
learning_rate=0.1, epochs=5)" ] }, { @@ -981,66 +1260,34 @@ }, { "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.GRU(20, return_sequences=True, input_shape=[None, 1]),\n", - " tf.keras.layers.GRU(20, return_sequences=True),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(10))\n", - "])\n", - "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "model.evaluate(X_valid, Y_valid)" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "plot_learning_curves(history.history[\"loss\"], history.history[\"val_loss\"])\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "np.random.seed(43)\n", - "\n", - "series = generate_time_series(1, 50 + 10)\n", - "X_new, Y_new = series[:, :50, :], series[:, 50:, :]\n", - "Y_pred = model.predict(X_new)[:, -1][..., np.newaxis]" - ] - }, - { - "cell_type": "code", - "execution_count": 56, + "execution_count": 70, "metadata": { "scrolled": true }, "outputs": [], "source": [ - "plot_multiple_forecasts(X_new, Y_new, Y_pred)\n", - "plt.show()" + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "gru_model = tf.keras.Sequential([\n", + " tf.keras.layers.GRU(32, return_sequences=True, input_shape=[None, 5]),\n", + " tf.keras.layers.Dense(14)\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just training for 5 epochs to show that it works (you can increase this if you want):" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "fit_and_evaluate(gru_model, seq2seq_train, seq2seq_valid,\n", + " learning_rate=0.1, epochs=5)" ] }, { @@ -1055,42 +1302,51 @@ "metadata": {}, "source": [ "```\n", - "1D conv layer with kernel size 4, stride 2, VALID padding:\n", - "\n", - " |-----2-----| |-----5---...------| |-----23----|\n", - " |-----1-----| |-----4-----| ... |-----22----|\n", - " |-----0----| |-----3-----| |---...|-----21----|\n", - "X: 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 42 43 44 45 46 47 48 49\n", - "Y: 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 43 44 45 46 47 48 49 50\n", - " /10 11 12 13 14 15 16 17 18 19 20 21 22 ... 52 53 54 55 56 57 58 59\n", - "\n", - "Output:\n", - "\n", - "X: 0/3 2/5 4/7 6/9 8/11 10/13 .../43 42/45 44/47 46/49\n", - "Y: 4/13 6/15 8/17 10/19 12/21 14/23 .../53 46/55 48/57 50/59\n", + " |-----0-----| |-----3----| |--... |-------52------|\n", + " |-----1----| |-----4----| ... | |-------53------|\n", + " |-----2----| |------5--...-51------| |-------54------|\n", + "X: 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 104 105 106 107 108 109 110 111\n", + "Y: from 4 6 8 10 12 ... 106 108 110 112\n", + " to 17 19 21 23 25 ... 
119 121 123 125\n", "```" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential([\n", - " tf.keras.layers.Conv1D(filters=20, kernel_size=4, strides=2, padding=\"valid\",\n", - " input_shape=[None, 1]),\n", - " tf.keras.layers.GRU(20, return_sequences=True),\n", - " tf.keras.layers.GRU(20, return_sequences=True),\n", - " tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(10))\n", + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "conv_rnn_model = tf.keras.Sequential([\n", + " tf.keras.layers.Conv1D(filters=32, kernel_size=4, strides=2,\n", + " activation=\"relu\", input_shape=[None, 5]),\n", + " tf.keras.layers.GRU(32, return_sequences=True),\n", + " tf.keras.layers.Dense(14)\n", "])\n", "\n", - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train[:, 3::2], epochs=20,\n", - " validation_data=(X_valid, Y_valid[:, 3::2]))" + "longer_train = to_seq2seq_dataset(mulvar_train, seq_length=112,\n", + " shuffle=True, seed=42)\n", + "longer_valid = to_seq2seq_dataset(mulvar_valid, seq_length=112)\n", + "downsampled_train = longer_train.map(lambda X, Y: (X, Y[:, 3::2]))\n", + "downsampled_valid = longer_valid.map(lambda X, Y: (X, Y[:, 3::2]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just training for 5 epochs to show that it works (you can increase this if you want):" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "fit_and_evaluate(conv_rnn_model, downsampled_train, downsampled_valid,\n", + " learning_rate=0.1, epochs=5)" ] }, { @@ -1105,34 +1361,55 @@ "metadata": {}, "source": [ "```\n", - "C2 /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\.../\\ /\\ /\\ /\\ /\\ /\\\n", - " \\ / \\ / \\ / \\ / \\ / \\ / \\ / \\ / \\ / \\\n", - " / \\ / \\ / \\ / \\\n", - "C1 /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /.../\\ /\\ /\\ /\\ /\\ /\\ /\\\n", - "X: 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 43 44 45 46 47 48 49\n", - "Y: 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 44 45 46 47 48 49 50\n", - " /10 11 12 13 14 15 16 17 18 19 20 21 22 ... 53 54 55 56 57 58 59\n", + " ⋮\n", + "C2 /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\...\n", + " \\ / \\ / \\ / \\ / \\ / \\ / \\ \n", + " / \\ / \\ / \\ \n", + "C1 /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /\\ /...\\\n", + "X: 0 1 2 3 4 5 6 7 8 9 10 11 12 ... 111\n", + "Y: 1 2 3 4 5 6 7 8 9 10 11 12 13 ... 112\n", + " /14 15 16 17 18 19 20 21 22 23 24 25 26 ... 
125\n", "```" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ - "np.random.seed(42)\n", - "tf.random.set_seed(42)\n", - "\n", - "model = tf.keras.Sequential()\n", - "model.add(tf.keras.layers.InputLayer(input_shape=[None, 1]))\n", + "tf.random.set_seed(42) # extra code – ensures reproducibility\n", + "wavenet_model = tf.keras.Sequential()\n", + "wavenet_model.add(tf.keras.layers.InputLayer(input_shape=[None, 5]))\n", "for rate in (1, 2, 4, 8) * 2:\n", - " model.add(tf.keras.layers.Conv1D(filters=20, kernel_size=2, padding=\"causal\",\n", - " activation=\"relu\", dilation_rate=rate))\n", - "model.add(tf.keras.layers.Conv1D(filters=10, kernel_size=1))\n", - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=20,\n", - " validation_data=(X_valid, Y_valid))" + " wavenet_model.add(tf.keras.layers.Conv1D(\n", + " filters=32, kernel_size=2, padding=\"causal\", activation=\"relu\",\n", + " dilation_rate=rate))\n", + "wavenet_model.add(tf.keras.layers.Conv1D(filters=14, kernel_size=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just training for 5 epochs to show that it works (you can increase this if you want):" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "fit_and_evaluate(wavenet_model, longer_train, longer_valid,\n", + " learning_rate=0.1, epochs=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extra Material – Wavenet Implementation" ] }, { @@ -1144,7 +1421,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1152,6 +1429,7 @@ " def __init__(self, activation=\"tanh\", **kwargs):\n", " super().__init__(**kwargs)\n", " self.activation = tf.keras.activations.get(activation)\n", + "\n", " def call(self, inputs):\n", " n_filters = inputs.shape[-1] // 2\n", " linear_output = self.activation(inputs[..., :n_filters])\n", @@ -1161,7 +1439,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1175,41 +1453,46 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ - "tf.keras.backend.clear_session()\n", - "np.random.seed(42)\n", "tf.random.set_seed(42)\n", "\n", - "n_layers_per_block = 3 # 10 in the paper\n", - "n_blocks = 1 # 3 in the paper\n", - "n_filters = 32 # 128 in the paper\n", - "n_outputs = 10 # 256 in the paper\n", + "n_layers_per_block = 3 # 10 in the paper\n", + "n_blocks = 1 # 3 in the paper\n", + "n_filters = 32 # 128 in the paper\n", + "n_outputs = 14 # 256 in the paper\n", "\n", - "inputs = tf.keras.layers.Input(shape=[None, 1])\n", + "inputs = tf.keras.layers.Input(shape=[None, 5])\n", "z = tf.keras.layers.Conv1D(n_filters, kernel_size=2, padding=\"causal\")(inputs)\n", "skip_to_last = []\n", "for dilation_rate in [2**i for i in range(n_layers_per_block)] * n_blocks:\n", " z, skip = wavenet_residual_block(z, n_filters, dilation_rate)\n", " skip_to_last.append(skip)\n", + "\n", "z = tf.keras.activations.relu(tf.keras.layers.Add()(skip_to_last))\n", "z = tf.keras.layers.Conv1D(n_filters, kernel_size=1, activation=\"relu\")(z)\n", - "Y_proba = tf.keras.layers.Conv1D(n_outputs, kernel_size=1, activation=\"softmax\")(z)\n", + "Y_preds = tf.keras.layers.Conv1D(n_outputs, kernel_size=1)(z)\n", "\n", - 
"model = tf.keras.Model(inputs=[inputs], outputs=[Y_proba])" + "full_wavenet_model = tf.keras.Model(inputs=[inputs], outputs=[Y_preds])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just training for 5 epochs to show that it works (you can increase this if you want):" ] }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ - "model.compile(loss=\"mse\", optimizer=\"adam\", metrics=[last_time_step_mse])\n", - "history = model.fit(X_train, Y_train, epochs=2,\n", - " validation_data=(X_valid, Y_valid))" + "fit_and_evaluate(full_wavenet_model, longer_train, longer_valid,\n", + " learning_rate=0.1, epochs=5)" ] }, { @@ -1237,7 +1520,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "See Appendix A." + "1. Here are a few RNN applications:\n", + " * For a sequence-to-sequence RNN: predicting the weather (or any other time series), machine translation (using an Encoder–Decoder architecture), video captioning, speech to text, music generation (or other sequence generation), identifying the chords of a song\n", + " * For a sequence-to-vector RNN: classifying music samples by music genre, analyzing the sentiment of a book review, predicting what word an aphasic patient is thinking of based on readings from brain implants, predicting the probability that a user will want to watch a movie based on their watch history (this is one of many possible implementations of _collaborative filtering_ for a recommender system)\n", + " * For a vector-to-sequence RNN: image captioning, creating a music playlist based on an embedding of the current artist, generating a melody based on a set of parameters, locating pedestrians in a picture (e.g., a video frame from a self-driving car's camera)\n", + "2. An RNN layer must have three-dimensional inputs: the first dimension is the batch dimension (its size is the batch size), the second dimension represents the time (its size is the number of time steps), and the third dimension holds the inputs at each time step (its size is the number of input features per time step). For example, if you want to process a batch containing 5 time series of 10 time steps each, with 2 values per time step (e.g., the temperature and the wind speed), the shape will be [5, 10, 2]. The outputs are also three-dimensional, with the same first two dimensions, but the last dimension is equal to the number of neurons. For example, if an RNN layer with 32 neurons processes the batch we just discussed, the output will have a shape of [5, 10, 32].\n", + "3. To build a deep sequence-to-sequence RNN using Keras, you must set `return_sequences=True` for all RNN layers. To build a sequence-to-vector RNN, you must set `return_sequences=True` for all RNN layers except for the top RNN layer, which must have `return_sequences=False` (or do not set this argument at all, since `False` is the default).\n", + "4. If you have a daily univariate time series, and you want to forecast the next seven days, the simplest RNN architecture you can use is a stack of RNN layers (all with `return_sequences=True` except for the top RNN layer), using seven neurons in the output RNN layer. You can then train this model using random windows from the time series (e.g., sequences of 30 consecutive days as the inputs, and a vector containing the values of the next 7 days as the target). This is a sequence-to-vector RNN. 
Alternatively, you could set `return_sequences=True` for all RNN layers to create a sequence-to-sequence RNN. You can train this model using random windows from the time series, with target sequences of the same length as the input sequences. Each target sequence should have seven values per time step (e.g., for time step _t_, the target should be a vector containing the values at time steps _t_ + 1 to _t_ + 7).\n",
    "5. The two main difficulties when training RNNs are unstable gradients (exploding or vanishing) and a very limited short-term memory. These problems both get worse when dealing with long sequences. To alleviate the unstable gradients problem, you can use a smaller learning rate, use a saturating activation function such as the hyperbolic tangent (which is the default), and possibly use gradient clipping, Layer Normalization, or dropout at each time step. To tackle the limited short-term memory problem, you can use `LSTM` or `GRU` layers (this also helps with the unstable gradients problem).\n",
    "6. An LSTM cell's architecture looks complicated, but it's actually not too hard if you understand the underlying logic. The cell has a short-term state vector and a long-term state vector. At each time step, the inputs and the previous short-term state are fed to a simple RNN cell and three gates: the forget gate decides what to remove from the long-term state, the input gate decides which part of the output of the simple RNN cell should be added to the long-term state, and the output gate decides which part of the long-term state should be output at this time step (after going through the tanh activation function). The new short-term state is equal to the output of the cell. See Figure 15–12.\n",
    "7. An RNN layer is fundamentally sequential: in order to compute the outputs at time step _t_, it has to first compute the outputs at all earlier time steps. This makes it impossible to parallelize. On the other hand, a 1D convolutional layer lends itself well to parallelization since it does not hold a state between time steps. In other words, it has no memory: the output at any time step can be computed based only on a small window of values from the inputs without having to know all the past values. Moreover, since a 1D convolutional layer is not recurrent, it suffers less from unstable gradients. One or more 1D convolutional layers can be useful in an RNN to efficiently preprocess the inputs, for example to reduce their temporal resolution (downsampling) and thereby help the RNN layers detect long-term patterns. In fact, it is possible to use only convolutional layers, for example by building a WaveNet architecture.\n",
    "8. To classify videos based on their visual content, one possible architecture could be to take (say) one frame per second, then run every frame through the same convolutional neural network (e.g., a pretrained Xception model, possibly frozen if your dataset is not large), feed the sequence of outputs from the CNN to a sequence-to-vector RNN, and finally run its output through a softmax layer, giving you all the class probabilities. For training you would use cross entropy as the cost function. If you wanted to use the audio for classification as well, you could use a stack of strided 1D convolutional layers to reduce the temporal resolution from thousands of audio frames per second to just one per second (to match the number of images per second), and concatenate the output sequence to the inputs of the sequence-to-vector RNN (along the last dimension).",
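    "\n",
    "To make answers 2 and 3 concrete, here is a minimal sketch (an illustration added here, not from the book; it assumes TensorFlow ≥ 2.6, imported as `tf` at the top of this notebook):\n",
    "\n",
    "```python\n",
    "X = tf.random.normal([5, 10, 2])  # 5 series, 10 time steps, 2 features each\n",
    "seq2seq = tf.keras.layers.SimpleRNN(32, return_sequences=True)\n",
    "seq2vec = tf.keras.layers.SimpleRNN(32)  # return_sequences=False by default\n",
    "print(seq2seq(X).shape)  # (5, 10, 32): one output vector per time step\n",
    "print(seq2vec(X).shape)  # (5, 32): only the last time step's output\n",
    "```"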
] }, { @@ -1263,32 +1556,36 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "DOWNLOAD_ROOT = \"http://download.tensorflow.org/data/\"\n", "FILENAME = \"quickdraw_tutorial_dataset_v1.tar.gz\"\n", "filepath = tf.keras.utils.get_file(FILENAME,\n", - " DOWNLOAD_ROOT + FILENAME,\n", - " cache_subdir=\"datasets/quickdraw\",\n", - " extract=True)" + " DOWNLOAD_ROOT + FILENAME,\n", + " cache_subdir=\"datasets/quickdraw\",\n", + " extract=True)" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "quickdraw_dir = Path(filepath).parent\n", - "train_files = sorted([str(path) for path in quickdraw_dir.glob(\"training.tfrecord-*\")])\n", - "eval_files = sorted([str(path) for path in quickdraw_dir.glob(\"eval.tfrecord-*\")])" + "train_files = sorted(\n", + " [str(path) for path in quickdraw_dir.glob(\"training.tfrecord-*\")]\n", + ")\n", + "eval_files = sorted(\n", + " [str(path) for path in quickdraw_dir.glob(\"eval.tfrecord-*\")]\n", + ")" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1297,7 +1594,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1306,7 +1603,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1319,7 +1616,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1329,7 +1626,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1338,7 +1635,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1358,7 +1655,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1377,7 +1674,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1388,7 +1685,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1400,7 +1697,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1408,7 +1705,7 @@ " origin = np.array([[0., 0., 0.]])\n", " sketch = np.r_[origin, sketch]\n", " stroke_end_indices = np.argwhere(sketch[:, -1]==1.)[:, 0]\n", - " coordinates = np.cumsum(sketch[:, :2], axis=0)\n", + " coordinates = sketch[:, :2].cumsum(axis=0)\n", " strokes = np.split(coordinates, stroke_end_indices + 1)\n", " title = class_names[label.numpy()] if label is not None else \"Try to guess\"\n", " plt.title(title)\n", @@ -1440,7 +1737,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1454,7 +1751,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1468,7 +1765,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1493,7 +1790,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1503,7 +1800,7 @@ }, { "cell_type": "code", - "execution_count": 79, + 
"execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1512,7 +1809,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1533,7 +1830,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1551,11 +1848,12 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ - "DOWNLOAD_ROOT = \"https://github.com/ageron/handson-ml2/raw/master/datasets/jsb_chorales/\"\n", + "DOWNLOAD_ROOT = (\"https://raw.githubusercontent.com/ageron/handson-ml3/main/\"\n", + " \"datasets/jsb_chorales/\")\n", "FILENAME = \"jsb_chorales.tgz\"\n", "filepath = tf.keras.utils.get_file(FILENAME,\n", " DOWNLOAD_ROOT + FILENAME,\n", @@ -1565,7 +1863,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1577,7 +1875,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1593,7 +1891,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1609,7 +1907,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1636,7 +1934,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1651,7 +1949,7 @@ " note_duration = 60 / tempo # the tempo is measured in beats per minutes\n", " # To reduce click sound at every beat, we round the frequencies to try to\n", " # get the samples close to zero at the end of each note.\n", - " frequencies = np.round(note_duration * frequencies) / note_duration\n", + " frequencies = (note_duration * frequencies).round() / note_duration\n", " n_samples = int(note_duration * sample_rate)\n", " time = np.linspace(0, note_duration, n_samples)\n", " sine_waves = np.sin(2 * np.pi * frequencies.reshape(-1, 1) * time)\n", @@ -1689,7 +1987,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1719,7 +2017,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -1763,7 +2061,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -1778,7 +2076,7 @@ "source": [ "Now let's create the model:\n", "\n", - "* We could feed the note values directly to the model, as floats, but this would probably not give good results. Indeed, the relationships between notes are not that simple: for example, if you replace a C3 with a C4, the melody will still sound fine, even though these notes are 12 semi-tones apart (i.e., one octave). Conversely, if you replace a C3 with a C\\#3, it's very likely that the chord will sound horrible, despite these notes being just next to each other. So we will use an `Embedding` layer to convert each note to a small vector representation (see Chapter 15 for more details on embeddings). We will use 5-dimensional embeddings, so the output of this first layer will have a shape of `[batch_size, window_size, 5]`.\n", + "* We could feed the note values directly to the model, as floats, but this would probably not give good results. 
Indeed, the relationships between notes are not that simple: for example, if you replace a C3 with a C4, the melody will still sound fine, even though these notes are 12 semi-tones apart (i.e., one octave). Conversely, if you replace a C3 with a C\#3, it's very likely that the chord will sound horrible, despite these notes being just next to each other. So we will use an `Embedding` layer to convert each note to a small vector representation (see Chapter 16 for more details on embeddings). We will use 5-dimensional embeddings, so the output of this first layer will have a shape of `[batch_size, window_size, 5]`.\n",
    "* We will then feed this data to a small WaveNet-like neural network, composed of a stack of 4 `Conv1D` layers with doubling dilation rates. We will intersperse these layers with `BatchNormalization` layers for faster and better convergence.\n",
    "* Then one `LSTM` layer to try to capture long-term patterns.\n",
    "* And finally a `Dense` layer to produce the final note probabilities. It will predict one probability for each chorale in the batch, for each time step, and for each possible note (including silence). So the output shape will be `[batch_size, window_size, 47]`."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 91,
+   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1819,7 +2117,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 92,
+   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1845,7 +2143,7 @@
 },
 {
   "cell_type": "code",
-   "execution_count": 93,
+   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1871,12 +2169,12 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "**Warning**: `model.predict_classes(X)` is deprecated. It is replaced with `np.argmax(model.predict(X), axis=-1)`."
+    "**Warning**: `model.predict_classes(X)` is deprecated. It is replaced with `model.predict(X).argmax(axis=-1)`.",
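+    "\n",
+    "For example, here is a minimal, hypothetical sketch of the replacement (it assumes a trained classification `model` and suitably shaped inputs `X_new`):\n",
+    "\n",
+    "```python\n",
+    "y_proba = model.predict(X_new)    # array of per-class probabilities\n",
+    "y_pred = y_proba.argmax(axis=-1)  # index of the most likely class\n",
+    "```"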
] }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1886,7 +2184,7 @@ " for chord in range(length):\n", " for note in range(4):\n", " #next_note = model.predict_classes(arpegio)[:1, -1:]\n", - " next_note = np.argmax(model.predict(arpegio), axis=-1)[:1, -1:]\n", + " next_note = model.predict(arpegio).argmax(axis=-1)[:1, -1:]\n", " arpegio = tf.concat([arpegio, next_note], axis=1)\n", " arpegio = tf.where(arpegio == 0, arpegio, arpegio + min_note - 1)\n", " return tf.reshape(arpegio, shape=[-1, 4])" @@ -1901,7 +2199,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1918,7 +2216,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1937,7 +2235,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1965,7 +2263,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": null, "metadata": { "scrolled": true }, @@ -1977,7 +2275,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1987,7 +2285,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2004,7 +2302,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2014,7 +2312,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" },