From 581253b47ae9b9b03d84a3553590922a499b880e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?=
Date: Tue, 8 May 2018 12:43:49 +0200
Subject: [PATCH] Add xgboost example and upgrade to latest matplotlib version

---
 07_ensemble_learning_and_random_forests.ipynb | 241 +++++++++++-------
 1 file changed, 151 insertions(+), 90 deletions(-)

diff --git a/07_ensemble_learning_and_random_forests.ipynb b/07_ensemble_learning_and_random_forests.ipynb
index 2cb163b..570f146 100644
--- a/07_ensemble_learning_and_random_forests.ipynb
+++ b/07_ensemble_learning_and_random_forests.ipynb
@@ -31,9 +31,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# To support both python 2 and python 3\n",
@@ -107,9 +105,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.model_selection import train_test_split\n",
@@ -136,8 +132,7 @@
     "\n",
     "voting_clf = VotingClassifier(\n",
     "    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
-    "    voting='hard')\n",
-    "voting_clf.fit(X_train, y_train)"
+    "    voting='hard')"
    ]
   },
   {
    "cell_type": "code",
@@ -145,6 +140,15 @@
    "execution_count": 6,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "voting_clf.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "from sklearn.metrics import accuracy_score\n",
     "\n",
@@ -156,7 +160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -172,7 +176,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -193,10 +197,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 10,
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.ensemble import BaggingClassifier\n",
@@ -211,7 +213,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -221,7 +223,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -233,10 +235,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 13,
+   "metadata": {},
    "outputs": [],
    "source": [
     "from matplotlib.colors import ListedColormap\n",
@@ -248,7 +248,7 @@
     "    X_new = np.c_[x1.ravel(), x2.ravel()]\n",
     "    y_pred = clf.predict(X_new).reshape(x1.shape)\n",
     "    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])\n",
-    "    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap, linewidth=10)\n",
+    "    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)\n",
     "    if contour:\n",
     "        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])\n",
     "        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)\n",
@@ -261,7 +261,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -285,7 +285,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -296,7 +296,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -306,10 +306,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 17,
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.ensemble import RandomForestClassifier\n",
@@ -322,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -331,7 +329,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -345,7 +343,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -354,7 +352,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -378,7 +376,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -391,7 +389,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -400,7 +398,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -418,10 +416,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 25,
+   "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.datasets import fetch_mldata\n",
@@ -430,7 +426,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -440,10 +436,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 27,
+   "metadata": {},
    "outputs": [],
    "source": [
     "def plot_digit(data):\n",
@@ -455,7 +449,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -477,7 +471,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -491,7 +485,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -500,7 +494,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -509,28 +503,28 @@
     "plt.figure(figsize=(11, 4))\n",
     "for subplot, learning_rate in ((121, 1), (122, 0.5)):\n",
     "    sample_weights = np.ones(m)\n",
+    "    plt.subplot(subplot)\n",
     "    for i in range(5):\n",
-    "        plt.subplot(subplot)\n",
     "        svm_clf = SVC(kernel=\"rbf\", C=0.05, random_state=42)\n",
     "        svm_clf.fit(X_train, y_train, sample_weight=sample_weights)\n",
     "        y_pred = svm_clf.predict(X_train)\n",
     "        sample_weights[y_pred != y_train] *= (1 + learning_rate)\n",
     "        plot_decision_boundary(svm_clf, X, y, alpha=0.2)\n",
     "        plt.title(\"learning_rate = {}\".format(learning_rate), fontsize=16)\n",
+    "    if subplot == 121:\n",
+    "        plt.text(-0.7, -0.65, \"1\", fontsize=14)\n",
+    "        plt.text(-0.6, -0.10, \"2\", fontsize=14)\n",
+    "        plt.text(-0.5, 0.10, \"3\", fontsize=14)\n",
+    "        plt.text(-0.4, 0.55, \"4\", fontsize=14)\n",
+    "        plt.text(-0.3, 0.90, \"5\", fontsize=14)\n",
     "\n",
-    "plt.subplot(121)\n",
-    "plt.text(-0.7, -0.65, \"1\", fontsize=14)\n",
-    "plt.text(-0.6, -0.10, \"2\", fontsize=14)\n",
-    "plt.text(-0.5, 0.10, \"3\", fontsize=14)\n",
-    "plt.text(-0.4, 0.55, \"4\", fontsize=14)\n",
-    "plt.text(-0.3, 0.90, \"5\", fontsize=14)\n",
     "save_fig(\"boosting_plot\")\n",
     "plt.show()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -546,10 +540,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 33,
+   "metadata": {},
    "outputs": [],
    "source": [
     "np.random.seed(42)\n",
@@ -559,7 +551,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -571,7 +563,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -582,7 +574,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -591,24 +583,13 @@
     "tree_reg3.fit(X, y3)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "metadata": {
-    "collapsed": true
-   },
-   "outputs": [],
-   "source": [
-    "X_new = np.array([[0.8]])"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
-    "y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))"
+    "X_new = np.array([[0.8]])"
    ]
   },
   {
@@ -617,7 +598,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "y_pred"
+    "y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))"
    ]
   },
   {
@@ -625,6 +606,15 @@
    "execution_count": 39,
    "metadata": {},
    "outputs": [],
+   "source": [
+    "y_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [],
    "source": [
     "def plot_predictions(regressors, X, y, axes, label=None, style=\"r-\", data_style=\"b.\", data_label=None):\n",
     "    x1 = np.linspace(axes[0], axes[1], 500)\n",
@@ -671,7 +661,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -683,7 +673,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -693,7 +683,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -720,7 +710,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -743,10 +733,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
-   "metadata": {
-    "collapsed": true
-   },
+   "execution_count": 45,
+   "metadata": {},
    "outputs": [],
    "source": [
     "min_error = np.min(errors)"
@@ -754,7 +742,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -780,7 +768,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -804,13 +792,88 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [],
    "source": [
     "print(gbrt.n_estimators)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Minimum validation MSE:\", min_val_error)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Using XGBoost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    import xgboost\n",
+    "except ImportError as ex:\n",
+    "    print(\"Error: the xgboost library is not installed.\")\n",
+    "    xgboost = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if xgboost is not None: # not shown in the book\n",
+    "    xgb_reg = xgboost.XGBRegressor(random_state=42)\n",
+    "    xgb_reg.fit(X_train, y_train)\n",
+    "    y_pred = xgb_reg.predict(X_val)\n",
+    "    val_error = mean_squared_error(y_val, y_pred)\n",
+    "    print(\"Validation MSE:\", val_error)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if xgboost is not None: # not shown in the book\n",
+    "    xgb_reg.fit(X_train, y_train,\n",
+    "                eval_set=[(X_val, y_val)], early_stopping_rounds=2)\n",
+    "    y_pred = xgb_reg.predict(X_val)\n",
+    "    val_error = mean_squared_error(y_val, y_pred)\n",
+    "    print(\"Validation MSE:\", val_error)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%timeit GradientBoostingRegressor().fit(X_train, y_train)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -830,9 +893,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   }
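
Note (not part of the patch): the new XGBoost cells reuse X_train, X_val, y_train, y_val and mean_squared_error from the notebook's earlier gradient-boosting early-stopping cells. Below is a minimal self-contained sketch of the same pattern, with an assumed noisy quadratic toy dataset in the spirit of that section (the exact data and random_state values here are illustrative, not taken from the diff). It passes early_stopping_rounds to fit() as the patch does; recent xgboost releases expect that argument in the XGBRegressor constructor instead.

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

try:
    import xgboost
except ImportError:
    xgboost = None  # mirrors the notebook: warn and skip the XGBoost cells if it is missing

if xgboost is not None:
    # Illustrative noisy quadratic dataset (assumed, similar to the chapter's GBRT example)
    np.random.seed(42)
    X = np.random.rand(100, 1) - 0.5
    y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)

    # Fit with early stopping on a held-out validation set, as in the patch's new cells
    xgb_reg = xgboost.XGBRegressor(random_state=42)
    xgb_reg.fit(X_train, y_train,
                eval_set=[(X_val, y_val)], early_stopping_rounds=2)
    y_pred = xgb_reg.predict(X_val)
    print("Validation MSE:", mean_squared_error(y_val, y_pred))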