handson-ml/07_ensemble_learning_and_ra...

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Chapter 7 – Ensemble Learning and Random Forests**"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "_This notebook contains all the sample code and solutions to the exercises in chapter 7._"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To support both python 2 and python 3\n",
    "from __future__ import division, print_function, unicode_literals\n",
    "\n",
    "# Common imports\n",
    "import numpy as np\n",
    "import os\n",
    "\n",
    "# to make this notebook's output stable across runs\n",
    "np.random.seed(42)\n",
    "\n",
    "# To plot pretty figures\n",
    "%matplotlib inline\n",
    "import matplotlib\n",
    "import matplotlib.pyplot as plt\n",
    "plt.rcParams['axes.labelsize'] = 14\n",
    "plt.rcParams['xtick.labelsize'] = 12\n",
    "plt.rcParams['ytick.labelsize'] = 12\n",
    "\n",
    "# Where to save the figures\n",
    "PROJECT_ROOT_DIR = \".\"\n",
    "CHAPTER_ID = \"ensembles\"\n",
    "\n",
    "def image_path(fig_id):\n",
    "    return os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id)\n",
    "\n",
    "def save_fig(fig_id, tight_layout=True):\n",
    "    print(\"Saving figure\", fig_id)\n",
    "    if tight_layout:\n",
    "        plt.tight_layout()\n",
    "    plt.savefig(image_path(fig_id) + \".png\", format='png', dpi=300)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Voting classifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "heads_proba = 0.51\n",
    "coin_tosses = (np.random.rand(10000, 10) < heads_proba).astype(np.int32)\n",
    "cumulative_heads_ratio = np.cumsum(coin_tosses, axis=0) / np.arange(1, 10001).reshape(-1, 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(8,3.5))\n",
    "plt.plot(cumulative_heads_ratio)\n",
    "plt.plot([0, 10000], [0.51, 0.51], \"k--\", linewidth=2, label=\"51%\")\n",
    "plt.plot([0, 10000], [0.5, 0.5], \"k-\", label=\"50%\")\n",
    "plt.xlabel(\"Number of coin tosses\")\n",
    "plt.ylabel(\"Heads ratio\")\n",
    "plt.legend(loc=\"lower right\")\n",
    "plt.axis([0, 10000, 0.42, 0.58])\n",
    "save_fig(\"law_of_large_numbers_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.datasets import make_moons\n",
    "\n",
    "X, y = make_moons(n_samples=500, noise=0.30, random_state=42)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "log_clf = LogisticRegression(random_state=42)\n",
    "rnd_clf = RandomForestClassifier(random_state=42)\n",
    "svm_clf = SVC(random_state=42)\n",
    "\n",
    "voting_clf = VotingClassifier(\n",
    "    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
    "    voting='hard')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "voting_clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n",
    "    clf.fit(X_train, y_train)\n",
    "    y_pred = clf.predict(X_test)\n",
    "    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "log_clf = LogisticRegression(random_state=42)\n",
    "rnd_clf = RandomForestClassifier(random_state=42)\n",
    "svm_clf = SVC(probability=True, random_state=42)\n",
    "\n",
    "voting_clf = VotingClassifier(\n",
    "    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
    "    voting='soft')\n",
    "voting_clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n",
    "    clf.fit(X_train, y_train)\n",
    "    y_pred = clf.predict(X_test)\n",
    "    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bagging ensembles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "bag_clf = BaggingClassifier(\n",
    "    DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
    "    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)\n",
    "bag_clf.fit(X_train, y_train)\n",
    "y_pred = bag_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "print(accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "tree_clf = DecisionTreeClassifier(random_state=42)\n",
    "tree_clf.fit(X_train, y_train)\n",
    "y_pred_tree = tree_clf.predict(X_test)\n",
    "print(accuracy_score(y_test, y_pred_tree))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib.colors import ListedColormap\n",
    "\n",
    "def plot_decision_boundary(clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.5, contour=True):\n",
    "    x1s = np.linspace(axes[0], axes[1], 100)\n",
    "    x2s = np.linspace(axes[2], axes[3], 100)\n",
    "    x1, x2 = np.meshgrid(x1s, x2s)\n",
    "    X_new = np.c_[x1.ravel(), x2.ravel()]\n",
    "    y_pred = clf.predict(X_new).reshape(x1.shape)\n",
    "    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n",
    "    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)\n",
    "    if contour:\n",
    "        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])\n",
    "        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)\n",
    "    plt.plot(X[:, 0][y==0], X[:, 1][y==0], \"yo\", alpha=alpha)\n",
    "    plt.plot(X[:, 0][y==1], X[:, 1][y==1], \"bs\", alpha=alpha)\n",
    "    plt.axis(axes)\n",
    "    plt.xlabel(r\"$x_1$\", fontsize=18)\n",
    "    plt.ylabel(r\"$x_2$\", fontsize=18, rotation=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(11,4))\n",
    "plt.subplot(121)\n",
    "plot_decision_boundary(tree_clf, X, y)\n",
    "plt.title(\"Decision Tree\", fontsize=14)\n",
    "plt.subplot(122)\n",
    "plot_decision_boundary(bag_clf, X, y)\n",
    "plt.title(\"Decision Trees with Bagging\", fontsize=14)\n",
    "save_fig(\"decision_tree_without_and_with_bagging_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Random Forests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "bag_clf = BaggingClassifier(\n",
    "    DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n",
    "    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "bag_clf.fit(X_train, y_train)\n",
    "y_pred = bag_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)\n",
    "rnd_clf.fit(X_train, y_train)\n",
    "\n",
    "y_pred_rf = rnd_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.sum(y_pred == y_pred_rf) / len(y_pred)  # almost identical predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "iris = load_iris()\n",
    "rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)\n",
    "rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n",
    "for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n",
    "    print(name, score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "rnd_clf.feature_importances_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(6, 4))\n",
    "\n",
    "for i in range(15):\n",
    "    tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)\n",
    "    indices_with_replacement = np.random.randint(0, len(X_train), len(X_train))\n",
    "    tree_clf.fit(X[indices_with_replacement], y[indices_with_replacement])\n",
    "    plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.02, contour=False)\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Out-of-Bag evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "bag_clf = BaggingClassifier(\n",
    "    DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
    "    bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)\n",
    "bag_clf.fit(X_train, y_train)\n",
    "bag_clf.oob_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "bag_clf.oob_decision_function_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "y_pred = bag_clf.predict(X_test)\n",
    "accuracy_score(y_test, y_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import fetch_mldata\n",
    "mnist = fetch_mldata('MNIST original')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "rnd_clf = RandomForestClassifier(random_state=42)\n",
    "rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_digit(data):\n",
    "    image = data.reshape(28, 28)\n",
    "    plt.imshow(image, cmap = matplotlib.cm.hot,\n",
    "               interpolation=\"nearest\")\n",
    "    plt.axis(\"off\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_digit(rnd_clf.feature_importances_)\n",
    "\n",
    "cbar = plt.colorbar(ticks=[rnd_clf.feature_importances_.min(), rnd_clf.feature_importances_.max()])\n",
    "cbar.ax.set_yticklabels(['Not important', 'Very important'])\n",
    "\n",
    "save_fig(\"mnist_feature_importance_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AdaBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "\n",
    "ada_clf = AdaBoostClassifier(\n",
    "    DecisionTreeClassifier(max_depth=1), n_estimators=200,\n",
    "    algorithm=\"SAMME.R\", learning_rate=0.5, random_state=42)\n",
    "ada_clf.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "plot_decision_boundary(ada_clf, X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "m = len(X_train)\n",
    "\n",
    "plt.figure(figsize=(11, 4))\n",
    "for subplot, learning_rate in ((121, 1), (122, 0.5)):\n",
    "    sample_weights = np.ones(m)\n",
    "    plt.subplot(subplot)\n",
    "    for i in range(5):\n",
    "        svm_clf = SVC(kernel=\"rbf\", C=0.05, random_state=42)\n",
    "        svm_clf.fit(X_train, y_train, sample_weight=sample_weights)\n",
    "        y_pred = svm_clf.predict(X_train)\n",
    "        sample_weights[y_pred != y_train] *= (1 + learning_rate)\n",
    "        plot_decision_boundary(svm_clf, X, y, alpha=0.2)\n",
    "        plt.title(\"learning_rate = {}\".format(learning_rate), fontsize=16)\n",
    "    if subplot == 121:\n",
    "        plt.text(-0.7, -0.65, \"1\", fontsize=14)\n",
    "        plt.text(-0.6, -0.10, \"2\", fontsize=14)\n",
    "        plt.text(-0.5,  0.10, \"3\", fontsize=14)\n",
    "        plt.text(-0.4,  0.55, \"4\", fontsize=14)\n",
    "        plt.text(-0.3,  0.90, \"5\", fontsize=14)\n",
    "\n",
    "save_fig(\"boosting_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(m for m in dir(ada_clf) if not m.startswith(\"_\") and m.endswith(\"_\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gradient Boosting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42)\n",
    "X = np.random.rand(100, 1) - 0.5\n",
    "y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
    "tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
    "tree_reg1.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "y2 = y - tree_reg1.predict(X)\n",
    "tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
    "tree_reg2.fit(X, y2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "y3 = y2 - tree_reg2.predict(X)\n",
    "tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
    "tree_reg3.fit(X, y3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_new = np.array([[0.8]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_predictions(regressors, X, y, axes, label=None, style=\"r-\", data_style=\"b.\", data_label=None):\n",
    "    x1 = np.linspace(axes[0], axes[1], 500)\n",
    "    y_pred = sum(regressor.predict(x1.reshape(-1, 1)) for regressor in regressors)\n",
    "    plt.plot(X[:, 0], y, data_style, label=data_label)\n",
    "    plt.plot(x1, y_pred, style, linewidth=2, label=label)\n",
    "    if label or data_label:\n",
    "        plt.legend(loc=\"upper center\", fontsize=16)\n",
    "    plt.axis(axes)\n",
    "\n",
    "plt.figure(figsize=(11,11))\n",
    "\n",
    "plt.subplot(321)\n",
    "plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label=\"$h_1(x_1)$\", style=\"g-\", data_label=\"Training set\")\n",
    "plt.ylabel(\"$y$\", fontsize=16, rotation=0)\n",
    "plt.title(\"Residuals and tree predictions\", fontsize=16)\n",
    "\n",
    "plt.subplot(322)\n",
    "plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label=\"$h(x_1) = h_1(x_1)$\", data_label=\"Training set\")\n",
    "plt.ylabel(\"$y$\", fontsize=16, rotation=0)\n",
    "plt.title(\"Ensemble predictions\", fontsize=16)\n",
    "\n",
    "plt.subplot(323)\n",
    "plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], label=\"$h_2(x_1)$\", style=\"g-\", data_style=\"k+\", data_label=\"Residuals\")\n",
    "plt.ylabel(\"$y - h_1(x_1)$\", fontsize=16)\n",
    "\n",
    "plt.subplot(324)\n",
    "plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label=\"$h(x_1) = h_1(x_1) + h_2(x_1)$\")\n",
    "plt.ylabel(\"$y$\", fontsize=16, rotation=0)\n",
    "\n",
    "plt.subplot(325)\n",
    "plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], label=\"$h_3(x_1)$\", style=\"g-\", data_style=\"k+\")\n",
    "plt.ylabel(\"$y - h_1(x_1) - h_2(x_1)$\", fontsize=16)\n",
    "plt.xlabel(\"$x_1$\", fontsize=16)\n",
    "\n",
    "plt.subplot(326)\n",
    "plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label=\"$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$\")\n",
    "plt.xlabel(\"$x_1$\", fontsize=16)\n",
    "plt.ylabel(\"$y$\", fontsize=16, rotation=0)\n",
    "\n",
    "save_fig(\"gradient_boosting_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "\n",
    "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)\n",
    "gbrt.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)\n",
    "gbrt_slow.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(11,4))\n",
    "\n",
    "plt.subplot(121)\n",
    "plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], label=\"Ensemble predictions\")\n",
    "plt.title(\"learning_rate={}, n_estimators={}\".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14)\n",
    "\n",
    "plt.subplot(122)\n",
    "plot_predictions([gbrt_slow], X, y, axes=[-0.5, 0.5, -0.1, 0.8])\n",
    "plt.title(\"learning_rate={}, n_estimators={}\".format(gbrt_slow.learning_rate, gbrt_slow.n_estimators), fontsize=14)\n",
    "\n",
    "save_fig(\"gbrt_learning_rate_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Gradient Boosting with Early stopping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)\n",
    "\n",
    "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)\n",
    "gbrt.fit(X_train, y_train)\n",
    "\n",
    "errors = [mean_squared_error(y_val, y_pred)\n",
    "          for y_pred in gbrt.staged_predict(X_val)]\n",
    "bst_n_estimators = np.argmin(errors)\n",
    "\n",
    "gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators, random_state=42)\n",
    "gbrt_best.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "min_error = np.min(errors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(11, 4))\n",
    "\n",
    "plt.subplot(121)\n",
    "plt.plot(errors, \"b.-\")\n",
    "plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], \"k--\")\n",
    "plt.plot([0, 120], [min_error, min_error], \"k--\")\n",
    "plt.plot(bst_n_estimators, min_error, \"ko\")\n",
    "plt.text(bst_n_estimators, min_error*1.2, \"Minimum\", ha=\"center\", fontsize=14)\n",
    "plt.axis([0, 120, 0, 0.01])\n",
    "plt.xlabel(\"Number of trees\")\n",
    "plt.title(\"Validation error\", fontsize=14)\n",
    "\n",
    "plt.subplot(122)\n",
    "plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])\n",
    "plt.title(\"Best model (%d trees)\" % bst_n_estimators, fontsize=14)\n",
    "\n",
    "save_fig(\"early_stopping_gbrt_plot\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)\n",
    "\n",
    "min_val_error = float(\"inf\")\n",
    "error_going_up = 0\n",
    "for n_estimators in range(1, 120):\n",
    "    gbrt.n_estimators = n_estimators\n",
    "    gbrt.fit(X_train, y_train)\n",
    "    y_pred = gbrt.predict(X_val)\n",
    "    val_error = mean_squared_error(y_val, y_pred)\n",
    "    if val_error < min_val_error:\n",
    "        min_val_error = val_error\n",
    "        error_going_up = 0\n",
    "    else:\n",
    "        error_going_up += 1\n",
    "        if error_going_up == 5:\n",
    "            break  # early stopping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(gbrt.n_estimators)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Minimum validation MSE:\", min_val_error)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "try:\n",
    "    import xgboost\n",
    "except ImportError as ex:\n",
    "    print(\"Error: the xgboost library is not installed.\")\n",
    "    xgboost = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "if xgboost is not None:  # not shown in the book\n",
    "    xgb_reg = xgboost.XGBRegressor(random_state=42)\n",
    "    xgb_reg.fit(X_train, y_train)\n",
    "    y_pred = xgb_reg.predict(X_val)\n",
    "    val_error = mean_squared_error(y_val, y_pred)\n",
    "    print(\"Validation MSE:\", val_error)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "if xgboost is not None:  # not shown in the book\n",
    "    xgb_reg.fit(X_train, y_train,\n",
    "                eval_set=[(X_val, y_val)], early_stopping_rounds=2)\n",
    "    y_pred = xgb_reg.predict(X_val)\n",
    "    val_error = mean_squared_error(y_val, y_pred)\n",
    "    print(\"Validation MSE:\", val_error)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "%timeit GradientBoostingRegressor().fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "# Exercise solutions"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Coming soon**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  },
  "nav_menu": {
   "height": "252px",
   "width": "333px"
  },
  "toc": {
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": 6,
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}