handson-ml/10_neural_nets_with_keras.i...

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Chapter 10 – Introduction to Artificial Neural Networks with Keras**\n",
    "\n",
    "_This notebook contains all the sample code and solutions to the exercises in chapter 10._"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<table align=\"left\">\n",
    "  <td>\n",
    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/ageron/handson-ml2/blob/master/10_neural_nets_with_keras.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
    "  </td>\n",
    "</table>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Python ≥3.5 is required\n",
    "import sys\n",
    "assert sys.version_info >= (3, 5)\n",
    "\n",
    "# Scikit-Learn ≥0.20 is required\n",
    "import sklearn\n",
    "assert sklearn.__version__ >= \"0.20\"\n",
    "\n",
    "try:\n",
    "    # %tensorflow_version only exists in Colab.\n",
    "    %tensorflow_version 2.x\n",
    "except Exception:\n",
    "    pass\n",
    "\n",
    "# TensorFlow ≥2.0 is required\n",
    "import tensorflow as tf\n",
    "assert tf.__version__ >= \"2.0\"\n",
    "\n",
    "# Common imports\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "# to make this notebook's output stable across runs\n",
    "np.random.seed(42)\n",
    "\n",
    "# To plot pretty figures\n",
    "%matplotlib inline\n",
    "import matplotlib as mpl\n",
    "import matplotlib.pyplot as plt\n",
    "mpl.rc('axes', labelsize=14)\n",
    "mpl.rc('xtick', labelsize=12)\n",
    "mpl.rc('ytick', labelsize=12)\n",
    "\n",
    "# Where to save the figures\n",
    "PROJECT_ROOT_DIR = \".\"\n",
    "CHAPTER_ID = \"ann\"\n",
    "IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
    "os.makedirs(IMAGES_PATH, exist_ok=True)\n",
    "\n",
    "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
    "    path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n",
    "    print(\"Saving figure\", fig_id)\n",
    "    if tight_layout:\n",
    "        plt.tight_layout()\n",
    "    plt.savefig(path, format=fig_extension, dpi=resolution)\n",
    "\n",
    "# Ignore useless warnings (see SciPy issue #5998)\n",
    "import warnings\n",
    "warnings.filterwarnings(action=\"ignore\", message=\"^internal gelsd\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Perceptrons"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Note**: we set `max_iter` and `tol` explicitly to avoid warnings about the fact that their default value will change in future versions of Scikit-Learn."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.datasets import load_iris\n",
    "from sklearn.linear_model import Perceptron\n",
    "\n",
    "iris = load_iris()\n",
    "X = iris.data[:, (2, 3)]  # petal length, petal width\n",
    "y = (iris.target == 0).astype(np.int)\n",
    "\n",
    "per_clf = Perceptron(max_iter=1000, tol=1e-3, random_state=42)\n",
    "per_clf.fit(X, y)\n",
    "\n",
    "y_pred = per_clf.predict([[2, 0.5]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = -per_clf.coef_[0][0] / per_clf.coef_[0][1]\n",
    "b = -per_clf.intercept_ / per_clf.coef_[0][1]\n",
    "\n",
    "axes = [0, 5, 0, 2]\n",
    "\n",
    "x0, x1 = np.meshgrid(\n",
    "        np.linspace(axes[0], axes[1], 500).reshape(-1, 1),\n",
    "        np.linspace(axes[2], axes[3], 200).reshape(-1, 1),\n",
    "    )\n",
    "X_new = np.c_[x0.ravel(), x1.ravel()]\n",
    "y_predict = per_clf.predict(X_new)\n",
    "zz = y_predict.reshape(x0.shape)\n",
    "\n",
    "plt.figure(figsize=(10, 4))\n",
    "plt.plot(X[y==0, 0], X[y==0, 1], \"bs\", label=\"Not Iris-Setosa\")\n",
    "plt.plot(X[y==1, 0], X[y==1, 1], \"yo\", label=\"Iris-Setosa\")\n",
    "\n",
    "plt.plot([axes[0], axes[1]], [a * axes[0] + b, a * axes[1] + b], \"k-\", linewidth=3)\n",
    "from matplotlib.colors import ListedColormap\n",
    "custom_cmap = ListedColormap(['#9898ff', '#fafab0'])\n",
    "\n",
    "plt.contourf(x0, x1, zz, cmap=custom_cmap)\n",
    "plt.xlabel(\"Petal length\", fontsize=14)\n",
    "plt.ylabel(\"Petal width\", fontsize=14)\n",
    "plt.legend(loc=\"lower right\", fontsize=14)\n",
    "plt.axis(axes)\n",
    "\n",
    "save_fig(\"perceptron_iris_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Activation functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def sigmoid(z):\n",
    "    return 1 / (1 + np.exp(-z))\n",
    "\n",
    "def relu(z):\n",
    "    return np.maximum(0, z)\n",
    "\n",
    "def derivative(f, z, eps=0.000001):\n",
    "    return (f(z + eps) - f(z - eps))/(2 * eps)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "z = np.linspace(-5, 5, 200)\n",
    "\n",
    "plt.figure(figsize=(11,4))\n",
    "\n",
    "plt.subplot(121)\n",
    "plt.plot(z, np.sign(z), \"r-\", linewidth=1, label=\"Step\")\n",
    "plt.plot(z, sigmoid(z), \"g--\", linewidth=2, label=\"Sigmoid\")\n",
    "plt.plot(z, np.tanh(z), \"b-\", linewidth=2, label=\"Tanh\")\n",
    "plt.plot(z, relu(z), \"m-.\", linewidth=2, label=\"ReLU\")\n",
    "plt.grid(True)\n",
    "plt.legend(loc=\"center right\", fontsize=14)\n",
    "plt.title(\"Activation functions\", fontsize=14)\n",
    "plt.axis([-5, 5, -1.2, 1.2])\n",
    "\n",
    "plt.subplot(122)\n",
    "plt.plot(z, derivative(np.sign, z), \"r-\", linewidth=1, label=\"Step\")\n",
    "plt.plot(0, 0, \"ro\", markersize=5)\n",
    "plt.plot(0, 0, \"rx\", markersize=10)\n",
    "plt.plot(z, derivative(sigmoid, z), \"g--\", linewidth=2, label=\"Sigmoid\")\n",
    "plt.plot(z, derivative(np.tanh, z), \"b-\", linewidth=2, label=\"Tanh\")\n",
    "plt.plot(z, derivative(relu, z), \"m-.\", linewidth=2, label=\"ReLU\")\n",
    "plt.grid(True)\n",
    "#plt.legend(loc=\"center right\", fontsize=14)\n",
    "plt.title(\"Derivatives\", fontsize=14)\n",
    "plt.axis([-5, 5, -0.2, 1.2])\n",
    "\n",
    "save_fig(\"activation_functions_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def heaviside(z):\n",
    "    return (z >= 0).astype(z.dtype)\n",
    "\n",
    "def mlp_xor(x1, x2, activation=heaviside):\n",
    "    return activation(-activation(x1 + x2 - 1.5) + activation(x1 + x2 - 0.5) - 0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "x1s = np.linspace(-0.2, 1.2, 100)\n",
    "x2s = np.linspace(-0.2, 1.2, 100)\n",
    "x1, x2 = np.meshgrid(x1s, x2s)\n",
    "\n",
    "z1 = mlp_xor(x1, x2, activation=heaviside)\n",
    "z2 = mlp_xor(x1, x2, activation=sigmoid)\n",
    "\n",
    "plt.figure(figsize=(10,4))\n",
    "\n",
    "plt.subplot(121)\n",
    "plt.contourf(x1, x2, z1)\n",
    "plt.plot([0, 1], [0, 1], \"gs\", markersize=20)\n",
    "plt.plot([0, 1], [1, 0], \"y^\", markersize=20)\n",
    "plt.title(\"Activation function: heaviside\", fontsize=14)\n",
    "plt.grid(True)\n",
    "\n",
    "plt.subplot(122)\n",
    "plt.contourf(x1, x2, z2)\n",
    "plt.plot([0, 1], [0, 1], \"gs\", markersize=20)\n",
    "plt.plot([0, 1], [1, 0], \"y^\", markersize=20)\n",
    "plt.title(\"Activation function: sigmoid\", fontsize=14)\n",
    "plt.grid(True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Building an Image Classifier"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First let's import TensorFlow and Keras."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow import keras"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.__version__"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's start by loading the fashion MNIST dataset. Keras has a number of functions to load popular datasets in `keras.datasets`. The dataset is already split for you between a training set and a test set, but it can be useful to split the training set further to have a validation set:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "fashion_mnist = keras.datasets.fashion_mnist\n",
    "(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist.load_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The training set contains 60,000 grayscale images, each 28x28 pixels:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_full.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each pixel intensity is represented as a byte (0 to 255):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_full.dtype"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's split the full training set into a validation set and a (smaller) training set. We also scale the pixel intensities down to the 0-1 range and convert them to floats, by dividing by 255."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_valid, X_train = X_train_full[:5000] / 255., X_train_full[5000:] / 255.\n",
    "y_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n",
    "X_test = X_test / 255."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can plot an image using Matplotlib's `imshow()` function, with a `'binary'`\n",
    " color map:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.imshow(X_train[0], cmap=\"binary\")\n",
    "plt.axis('off')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The labels are the class IDs (represented as uint8), from 0 to 9:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here are the corresponding class names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "class_names = [\"T-shirt/top\", \"Trouser\", \"Pullover\", \"Dress\", \"Coat\",\n",
    "               \"Sandal\", \"Shirt\", \"Sneaker\", \"Bag\", \"Ankle boot\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So the first image in the training set is a coat:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "class_names[y_train[0]]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The validation set contains 5,000 images, and the test set contains 10,000 images:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's take a look at a sample of the images in the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_rows = 4\n",
    "n_cols = 10\n",
    "plt.figure(figsize=(n_cols * 1.2, n_rows * 1.2))\n",
    "for row in range(n_rows):\n",
    "    for col in range(n_cols):\n",
    "        index = n_cols * row + col\n",
    "        plt.subplot(n_rows, n_cols, index + 1)\n",
    "        plt.imshow(X_train[index], cmap=\"binary\", interpolation=\"nearest\")\n",
    "        plt.axis('off')\n",
    "        plt.title(class_names[y_train[index]], fontsize=12)\n",
    "plt.subplots_adjust(wspace=0.2, hspace=0.5)\n",
    "save_fig('fashion_mnist_plot', tight_layout=False)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential()\n",
    "model.add(keras.layers.Flatten(input_shape=[28, 28]))\n",
    "model.add(keras.layers.Dense(300, activation=\"relu\"))\n",
    "model.add(keras.layers.Dense(100, activation=\"relu\"))\n",
    "model.add(keras.layers.Dense(10, activation=\"softmax\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.backend.clear_session()\n",
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Flatten(input_shape=[28, 28]),\n",
    "    keras.layers.Dense(300, activation=\"relu\"),\n",
    "    keras.layers.Dense(100, activation=\"relu\"),\n",
    "    keras.layers.Dense(10, activation=\"softmax\")\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.layers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.utils.plot_model(model, \"my_fashion_mnist_model.png\", show_shapes=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "hidden1 = model.layers[1]\n",
    "hidden1.name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.get_layer(hidden1.name) is hidden1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "weights, biases = hidden1.get_weights()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "weights.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "biases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "biases.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
    "              optimizer=\"sgd\",\n",
    "              metrics=[\"accuracy\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is equivalent to:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "```python\n",
    "model.compile(loss=keras.losses.sparse_categorical_crossentropy,\n",
    "              optimizer=keras.optimizers.SGD(),\n",
    "              metrics=[keras.metrics.sparse_categorical_accuracy])\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "history = model.fit(X_train, y_train, epochs=30,\n",
    "                    validation_data=(X_valid, y_valid))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "history.params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(history.epoch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "history.history.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "pd.DataFrame(history.history).plot(figsize=(8, 5))\n",
    "plt.grid(True)\n",
    "plt.gca().set_ylim(0, 1)\n",
    "save_fig(\"keras_learning_curves_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.evaluate(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_new = X_test[:3]\n",
    "y_proba = model.predict(X_new)\n",
    "y_proba.round(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = model.predict_classes(X_new)\n",
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.array(class_names)[y_pred]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_new = y_test[:3]\n",
    "y_new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(7.2, 2.4))\n",
    "for index, image in enumerate(X_new):\n",
    "    plt.subplot(1, 3, index + 1)\n",
    "    plt.imshow(image, cmap=\"binary\", interpolation=\"nearest\")\n",
    "    plt.axis('off')\n",
    "    plt.title(class_names[y_test[index]], fontsize=12)\n",
    "plt.subplots_adjust(wspace=0.2, hspace=0.5)\n",
    "save_fig('fashion_mnist_images_plot', tight_layout=False)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regression MLP"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's load, split and scale the California housing dataset (the original one, not the modified one as in chapter 2):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_california_housing\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "housing = fetch_california_housing()\n",
    "\n",
    "X_train_full, X_test, y_train_full, y_test = train_test_split(housing.data, housing.target, random_state=42)\n",
    "X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, random_state=42)\n",
    "\n",
    "scaler = StandardScaler()\n",
    "X_train = scaler.fit_transform(X_train)\n",
    "X_valid = scaler.transform(X_valid)\n",
    "X_test = scaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Dense(30, activation=\"relu\", input_shape=X_train.shape[1:]),\n",
    "    keras.layers.Dense(1)\n",
    "])\n",
    "model.compile(loss=\"mean_squared_error\", optimizer=keras.optimizers.SGD(lr=1e-3))\n",
    "history = model.fit(X_train, y_train, epochs=20, validation_data=(X_valid, y_valid))\n",
    "mse_test = model.evaluate(X_test, y_test)\n",
    "X_new = X_test[:3]\n",
    "y_pred = model.predict(X_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(pd.DataFrame(history.history))\n",
    "plt.grid(True)\n",
    "plt.gca().set_ylim(0, 1)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Functional API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Not all neural network models are simply sequential. Some may have complex topologies. Some may have multiple inputs and/or multiple outputs. For example, a Wide & Deep neural network (see [paper](https://ai.google/research/pubs/pub45413)) connects all or part of the inputs directly to the output layer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ = keras.layers.Input(shape=X_train.shape[1:])\n",
    "hidden1 = keras.layers.Dense(30, activation=\"relu\")(input_)\n",
    "hidden2 = keras.layers.Dense(30, activation=\"relu\")(hidden1)\n",
    "concat = keras.layers.concatenate([input_, hidden2])\n",
    "output = keras.layers.Dense(1)(concat)\n",
    "model = keras.models.Model(inputs=[input_], outputs=[output])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"mean_squared_error\", optimizer=keras.optimizers.SGD(lr=1e-3))\n",
    "history = model.fit(X_train, y_train, epochs=20,\n",
    "                    validation_data=(X_valid, y_valid))\n",
    "mse_test = model.evaluate(X_test, y_test)\n",
    "y_pred = model.predict(X_new)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "What if you want to send different subsets of input features through the wide or deep paths? We will send 5 features (features 0 to 4), and 6 through the deep path (features 2 to 7). Note that 3 features will go through both (features 2, 3 and 4)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_A = keras.layers.Input(shape=[5], name=\"wide_input\")\n",
    "input_B = keras.layers.Input(shape=[6], name=\"deep_input\")\n",
    "hidden1 = keras.layers.Dense(30, activation=\"relu\")(input_B)\n",
    "hidden2 = keras.layers.Dense(30, activation=\"relu\")(hidden1)\n",
    "concat = keras.layers.concatenate([input_A, hidden2])\n",
    "output = keras.layers.Dense(1, name=\"output\")(concat)\n",
    "model = keras.models.Model(inputs=[input_A, input_B], outputs=[output])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(lr=1e-3))\n",
    "\n",
    "X_train_A, X_train_B = X_train[:, :5], X_train[:, 2:]\n",
    "X_valid_A, X_valid_B = X_valid[:, :5], X_valid[:, 2:]\n",
    "X_test_A, X_test_B = X_test[:, :5], X_test[:, 2:]\n",
    "X_new_A, X_new_B = X_test_A[:3], X_test_B[:3]\n",
    "\n",
    "history = model.fit((X_train_A, X_train_B), y_train, epochs=20,\n",
    "                    validation_data=((X_valid_A, X_valid_B), y_valid))\n",
    "mse_test = model.evaluate((X_test_A, X_test_B), y_test)\n",
    "y_pred = model.predict((X_new_A, X_new_B))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Adding an auxiliary output for regularization:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_A = keras.layers.Input(shape=[5], name=\"wide_input\")\n",
    "input_B = keras.layers.Input(shape=[6], name=\"deep_input\")\n",
    "hidden1 = keras.layers.Dense(30, activation=\"relu\")(input_B)\n",
    "hidden2 = keras.layers.Dense(30, activation=\"relu\")(hidden1)\n",
    "concat = keras.layers.concatenate([input_A, hidden2])\n",
    "output = keras.layers.Dense(1, name=\"main_output\")(concat)\n",
    "aux_output = keras.layers.Dense(1, name=\"aux_output\")(hidden2)\n",
    "model = keras.models.Model(inputs=[input_A, input_B],\n",
    "                           outputs=[output, aux_output])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=[\"mse\", \"mse\"], loss_weights=[0.9, 0.1], optimizer=keras.optimizers.SGD(lr=1e-3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "history = model.fit([X_train_A, X_train_B], [y_train, y_train], epochs=20,\n",
    "                    validation_data=([X_valid_A, X_valid_B], [y_valid, y_valid]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "total_loss, main_loss, aux_loss = model.evaluate(\n",
    "    [X_test_A, X_test_B], [y_test, y_test])\n",
    "y_pred_main, y_pred_aux = model.predict([X_new_A, X_new_B])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# The subclassing API"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "class WideAndDeepModel(keras.models.Model):\n",
    "    def __init__(self, units=30, activation=\"relu\", **kwargs):\n",
    "        super().__init__(**kwargs)\n",
    "        self.hidden1 = keras.layers.Dense(units, activation=activation)\n",
    "        self.hidden2 = keras.layers.Dense(units, activation=activation)\n",
    "        self.main_output = keras.layers.Dense(1)\n",
    "        self.aux_output = keras.layers.Dense(1)\n",
    "        \n",
    "    def call(self, inputs):\n",
    "        input_A, input_B = inputs\n",
    "        hidden1 = self.hidden1(input_B)\n",
    "        hidden2 = self.hidden2(hidden1)\n",
    "        concat = keras.layers.concatenate([input_A, hidden2])\n",
    "        main_output = self.main_output(concat)\n",
    "        aux_output = self.aux_output(hidden2)\n",
    "        return main_output, aux_output\n",
    "\n",
    "model = WideAndDeepModel(30, activation=\"relu\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"mse\", loss_weights=[0.9, 0.1], optimizer=keras.optimizers.SGD(lr=1e-3))\n",
    "history = model.fit((X_train_A, X_train_B), (y_train, y_train), epochs=10,\n",
    "                    validation_data=((X_valid_A, X_valid_B), (y_valid, y_valid)))\n",
    "total_loss, main_loss, aux_loss = model.evaluate((X_test_A, X_test_B), (y_test, y_test))\n",
    "y_pred_main, y_pred_aux = model.predict((X_new_A, X_new_B))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = WideAndDeepModel(30, activation=\"relu\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Saving and Restoring"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Dense(30, activation=\"relu\", input_shape=[8]),\n",
    "    keras.layers.Dense(30, activation=\"relu\"),\n",
    "    keras.layers.Dense(1)\n",
    "])    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(lr=1e-3))\n",
    "history = model.fit(X_train, y_train, epochs=10, validation_data=(X_valid, y_valid))\n",
    "mse_test = model.evaluate(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save(\"my_keras_model.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.load_model(\"my_keras_model.h5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.predict(X_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_weights(\"my_keras_weights.ckpt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.load_weights(\"my_keras_weights.ckpt\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Using Callbacks during Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.backend.clear_session()\n",
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Dense(30, activation=\"relu\", input_shape=[8]),\n",
    "    keras.layers.Dense(30, activation=\"relu\"),\n",
    "    keras.layers.Dense(1)\n",
    "])    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(lr=1e-3))\n",
    "checkpoint_cb = keras.callbacks.ModelCheckpoint(\"my_keras_model.h5\", save_best_only=True)\n",
    "history = model.fit(X_train, y_train, epochs=10,\n",
    "                    validation_data=(X_valid, y_valid),\n",
    "                    callbacks=[checkpoint_cb])\n",
    "model = keras.models.load_model(\"my_keras_model.h5\") # rollback to best model\n",
    "mse_test = model.evaluate(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(lr=1e-3))\n",
    "early_stopping_cb = keras.callbacks.EarlyStopping(patience=10,\n",
    "                                                  restore_best_weights=True)\n",
    "history = model.fit(X_train, y_train, epochs=100,\n",
    "                    validation_data=(X_valid, y_valid),\n",
    "                    callbacks=[checkpoint_cb, early_stopping_cb])\n",
    "mse_test = model.evaluate(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PrintValTrainRatioCallback(keras.callbacks.Callback):\n",
    "    def on_epoch_end(self, epoch, logs):\n",
    "        print(\"\\nval/train: {:.2f}\".format(logs[\"val_loss\"] / logs[\"loss\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "val_train_ratio_cb = PrintValTrainRatioCallback()\n",
    "history = model.fit(X_train, y_train, epochs=1,\n",
    "                    validation_data=(X_valid, y_valid),\n",
    "                    callbacks=[val_train_ratio_cb])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TensorBoard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "root_logdir = os.path.join(os.curdir, \"my_logs\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_run_logdir():\n",
    "    import time\n",
    "    run_id = time.strftime(\"run_%Y_%m_%d-%H_%M_%S\")\n",
    "    return os.path.join(root_logdir, run_id)\n",
    "\n",
    "run_logdir = get_run_logdir()\n",
    "run_logdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.backend.clear_session()\n",
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Dense(30, activation=\"relu\", input_shape=[8]),\n",
    "    keras.layers.Dense(30, activation=\"relu\"),\n",
    "    keras.layers.Dense(1)\n",
    "])    \n",
    "model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(lr=1e-3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)\n",
    "history = model.fit(X_train, y_train, epochs=30,\n",
    "                    validation_data=(X_valid, y_valid),\n",
    "                    callbacks=[checkpoint_cb, tensorboard_cb])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To start the TensorBoard server, one option is to open a terminal, if needed activate the virtualenv where you installed TensorBoard, go to this notebook's directory, then type:\n",
    "\n",
    "```bash\n",
    "$ tensorboard --logdir=./my_logs --port=6006\n",
    "```\n",
    "\n",
    "You can then open your web browser to [localhost:6006](http://localhost:6006) and use TensorBoard. Once you are done, press Ctrl-C in the terminal window, this will shutdown the TensorBoard server.\n",
    "\n",
    "Alternatively, you can load TensorBoard's Jupyter extension and run it like this:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext tensorboard\n",
    "%tensorboard --logdir=./my_logs --port=6006"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_logdir2 = get_run_logdir()\n",
    "run_logdir2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.backend.clear_session()\n",
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Dense(30, activation=\"relu\", input_shape=[8]),\n",
    "    keras.layers.Dense(30, activation=\"relu\"),\n",
    "    keras.layers.Dense(1)\n",
    "])    \n",
    "model.compile(loss=\"mse\", optimizer=keras.optimizers.SGD(lr=0.05))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "tensorboard_cb = keras.callbacks.TensorBoard(run_logdir2)\n",
    "history = model.fit(X_train, y_train, epochs=30,\n",
    "                    validation_data=(X_valid, y_valid),\n",
    "                    callbacks=[checkpoint_cb, tensorboard_cb])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice how TensorBoard now sees two runs, and you can compare the learning curves."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check out the other available logging options:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "help(keras.callbacks.TensorBoard.__init__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hyperparameter Tuning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.backend.clear_session()\n",
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=[8]):\n",
    "    model = keras.models.Sequential()\n",
    "    model.add(keras.layers.InputLayer(input_shape=input_shape))\n",
    "    for layer in range(n_hidden):\n",
    "        model.add(keras.layers.Dense(n_neurons, activation=\"relu\"))\n",
    "    model.add(keras.layers.Dense(1))\n",
    "    optimizer = keras.optimizers.SGD(lr=learning_rate)\n",
    "    model.compile(loss=\"mse\", optimizer=optimizer)\n",
    "    return model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras_reg.fit(X_train, y_train, epochs=100,\n",
    "              validation_data=(X_valid, y_valid),\n",
    "              callbacks=[keras.callbacks.EarlyStopping(patience=10)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "mse_test = keras_reg.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = keras_reg.predict(X_new)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Warning**: the following cell crashes at the end of training. This seems to be caused by [Keras issue #13586](https://github.com/keras-team/keras/issues/13586), which was triggered by a recent change in Scikit-Learn. [Pull Request #13598](https://github.com/keras-team/keras/pull/13598) seems to fix the issue, so this problem should be resolved soon."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy.stats import reciprocal\n",
    "from sklearn.model_selection import RandomizedSearchCV\n",
    "\n",
    "param_distribs = {\n",
    "    \"n_hidden\": [0, 1, 2, 3],\n",
    "    \"n_neurons\": np.arange(1, 100),\n",
    "    \"learning_rate\": reciprocal(3e-4, 3e-2),\n",
    "}\n",
    "\n",
    "rnd_search_cv = RandomizedSearchCV(keras_reg, param_distribs, n_iter=10, cv=3, verbose=2)\n",
    "rnd_search_cv.fit(X_train, y_train, epochs=100,\n",
    "                  validation_data=(X_valid, y_valid),\n",
    "                  callbacks=[keras.callbacks.EarlyStopping(patience=10)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "rnd_search_cv.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "rnd_search_cv.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "rnd_search_cv.best_estimator_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "rnd_search_cv.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = rnd_search_cv.best_estimator_.model\n",
    "model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "model.evaluate(X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Exercise solutions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. to 9."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "See appendix A."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Exercise: Train a deep MLP on the MNIST dataset (you can load it using `keras.datasets.mnist.load_data()`. See if you can get over 98% precision. Try searching for the optimal learning rate by using the approach presented in this chapter (i.e., by growing the learning rate exponentially, plotting the loss, and finding the point where the loss shoots up). Try adding all the bells and whistles—save checkpoints, use early stopping, and plot learning curves using TensorBoard.*"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's load the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Just like for the Fashion MNIST dataset, the MNIST training set contains 60,000 grayscale images, each 28x28 pixels:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_full.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Each pixel intensity is also represented as a byte (0 to 255):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_full.dtype"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's split the full training set into a validation set and a (smaller) training set. We also scale the pixel intensities down to the 0-1 range and convert them to floats, by dividing by 255, just like we did for Fashion MNIST:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_valid, X_train = X_train_full[:5000] / 255., X_train_full[5000:] / 255.\n",
    "y_valid, y_train = y_train_full[:5000], y_train_full[5000:]\n",
    "X_test = X_test / 255."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's plot an image using Matplotlib's `imshow()` function, with a `'binary'`\n",
    " color map:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.imshow(X_train[0], cmap=\"binary\")\n",
    "plt.axis('off')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The labels are the class IDs (represented as uint8), from 0 to 9. Conveniently, the class IDs correspond to the digits represented in the images, so we don't need a `class_names` array:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The validation set contains 5,000 images, and the test set contains 10,000 images:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_valid.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's take a look at a sample of the images in the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_rows = 4\n",
    "n_cols = 10\n",
    "plt.figure(figsize=(n_cols * 1.2, n_rows * 1.2))\n",
    "for row in range(n_rows):\n",
    "    for col in range(n_cols):\n",
    "        index = n_cols * row + col\n",
    "        plt.subplot(n_rows, n_cols, index + 1)\n",
    "        plt.imshow(X_train[index], cmap=\"binary\", interpolation=\"nearest\")\n",
    "        plt.axis('off')\n",
    "        plt.title(y_train[index], fontsize=12)\n",
    "plt.subplots_adjust(wspace=0.2, hspace=0.5)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's build a simple dense network and find the optimal learning rate. We will need a callback to grow the learning rate at each iteration. It will also record the learning rate and the loss at each iteration:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "K = keras.backend\n",
    "\n",
    "class ExponentialLearningRate(keras.callbacks.Callback):\n",
    "    def __init__(self, factor):\n",
    "        self.factor = factor\n",
    "        self.rates = []\n",
    "        self.losses = []\n",
    "    def on_batch_end(self, batch, logs):\n",
    "        self.rates.append(K.get_value(self.model.optimizer.lr))\n",
    "        self.losses.append(logs[\"loss\"])\n",
    "        K.set_value(self.model.optimizer.lr, self.model.optimizer.lr * self.factor)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.backend.clear_session()\n",
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Flatten(input_shape=[28, 28]),\n",
    "    keras.layers.Dense(300, activation=\"relu\"),\n",
    "    keras.layers.Dense(100, activation=\"relu\"),\n",
    "    keras.layers.Dense(10, activation=\"softmax\")\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We will start with a small learning rate of 1e-3, and grow it by 0.5% at each iteration:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
    "              optimizer=keras.optimizers.SGD(lr=1e-3),\n",
    "              metrics=[\"accuracy\"])\n",
    "expon_lr = ExponentialLearningRate(factor=1.005)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's train the model for just 1 epoch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "history = model.fit(X_train, y_train, epochs=1,\n",
    "                    validation_data=(X_valid, y_valid),\n",
    "                    callbacks=[expon_lr])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can now plot the loss as a functionof the learning rate:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(expon_lr.rates, expon_lr.losses)\n",
    "plt.gca().set_xscale('log')\n",
    "plt.hlines(min(expon_lr.losses), min(expon_lr.rates), max(expon_lr.rates))\n",
    "plt.axis([min(expon_lr.rates), max(expon_lr.rates), 0, expon_lr.losses[0]])\n",
    "plt.xlabel(\"Learning rate\")\n",
    "plt.ylabel(\"Loss\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The loss starts shooting back up violently around 3e-1, so let's try using 2e-1 as our learning rate:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "keras.backend.clear_session()\n",
    "np.random.seed(42)\n",
    "tf.random.set_seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.Sequential([\n",
    "    keras.layers.Flatten(input_shape=[28, 28]),\n",
    "    keras.layers.Dense(300, activation=\"relu\"),\n",
    "    keras.layers.Dense(100, activation=\"relu\"),\n",
    "    keras.layers.Dense(10, activation=\"softmax\")\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
    "              optimizer=keras.optimizers.SGD(lr=2e-1),\n",
    "              metrics=[\"accuracy\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "run_index = 1 # increment this at every run\n",
    "run_logdir = os.path.join(os.curdir, \"my_mnist_logs\", \"run_{:03d}\".format(run_index))\n",
    "run_logdir"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "early_stopping_cb = keras.callbacks.EarlyStopping(patience=20)\n",
    "checkpoint_cb = keras.callbacks.ModelCheckpoint(\"my_mnist_model.h5\", save_best_only=True)\n",
    "tensorboard_cb = keras.callbacks.TensorBoard(run_logdir)\n",
    "\n",
    "history = model.fit(X_train, y_train, epochs=100,\n",
    "                    validation_data=(X_valid, y_valid),\n",
    "                    callbacks=[early_stopping_cb, checkpoint_cb, tensorboard_cb])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = keras.models.load_model(\"my_mnist_model.h5\") # rollback to best model\n",
    "model.evaluate(X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We got over 98% accuracy. Finally, let's look at the learning curves using TensorBoard:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {},
   "outputs": [],
   "source": [
    "%tensorboard --logdir=./my_mnist_logs --port=6006"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  },
  "nav_menu": {
   "height": "264px",
   "width": "369px"
  },
  "toc": {
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 6,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}