Synchronize chapter 7's code and the corresponding notebook's code

2017-06-02 10:57:06 +02:00 · 2017-06-02 10:57:06 +02:00 · b7779802f0
parent 0e8579943c
commit b7779802f0
1 changed files with 258 additions and 132 deletions
--- a/07_ensemble_learning_and_random_forests.ipynb
+++ b/07_ensemble_learning_and_random_forests.ipynb
@ -134,9 +134,7 @@
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "collapsed": true
   },
   "outputs": [],
   "source": [
@ -144,23 +142,76 @@
    "from sklearn.datasets import make_moons\n",
    "\n",
    "X, y = make_moons(n_samples=500, noise=0.30, random_state=42)\n",
-    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
-    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "\n",
+    "log_clf = LogisticRegression(random_state=42)\n",
+    "rnd_clf = RandomForestClassifier(random_state=42)\n",
+    "svm_clf = SVC(random_state=42)\n",
+    "\n",
+    "voting_clf = VotingClassifier(\n",
+    "    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
+    "    voting='hard')\n",
+    "voting_clf.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n",
+    "    clf.fit(X_train, y_train)\n",
+    "    y_pred = clf.predict(X_test)\n",
+    "    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
    "log_clf = LogisticRegression(random_state=42)\n",
    "rnd_clf = RandomForestClassifier(random_state=42)\n",
    "svm_clf = SVC(probability=True, random_state=42)\n",
    "\n",
    "voting_clf = VotingClassifier(\n",
    "    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
-    "        voting='soft'\n",
-    "    )\n",
-    "voting_clf.fit(X_train, y_train)\n",
-    "\n",
+    "    voting='soft')\n",
+    "voting_clf.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n",
@ -181,7 +232,25 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.ensemble import BaggingClassifier\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "\n",
+    "bag_clf = BaggingClassifier(\n",
+    "    DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
+    "    max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)\n",
+    "bag_clf.fit(X_train, y_train)\n",
+    "y_pred = bag_clf.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -189,23 +258,13 @@
   },
   "outputs": [],
   "source": [
-    "from sklearn.datasets import make_moons\n",
-    "from sklearn.ensemble import BaggingClassifier\n",
    "from sklearn.metrics import accuracy_score\n",
-    "from sklearn.tree import DecisionTreeClassifier\n",
-    "\n",
-    "bag_clf = BaggingClassifier(\n",
-    "        DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
-    "        max_samples=100, bootstrap=True, n_jobs=-1, random_state=42\n",
-    "    )\n",
-    "bag_clf.fit(X_train, y_train)\n",
-    "y_pred = bag_clf.predict(X_test)\n",
    "print(accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -221,7 +280,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -251,7 +310,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 13,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -282,7 +341,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 14,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -292,16 +351,24 @@
   "source": [
    "bag_clf = BaggingClassifier(\n",
    "    DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n",
-    "        n_estimators=500, max_samples=1.0, bootstrap=True,\n",
-    "        n_jobs=-1, random_state=42\n",
-    "    )\n",
+    "    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
    "bag_clf.fit(X_train, y_train)\n",
    "y_pred = bag_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 16,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -319,7 +386,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 17,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -332,7 +399,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 18,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -344,13 +411,13 @@
    "iris = load_iris()\n",
    "rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)\n",
    "rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n",
-    "for name, importance in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n",
-    "    print(name, \"=\", importance)"
+    "for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n",
+    "    print(name, score)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 19,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -363,7 +430,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 20,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -374,7 +441,7 @@
    "plt.figure(figsize=(6, 4))\n",
    "\n",
    "for i in range(15):\n",
-    "    tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42+i)\n",
+    "    tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)\n",
    "    indices_with_replacement = rnd.randint(0, len(X_train), len(X_train))\n",
    "    tree_clf.fit(X[indices_with_replacement], y[indices_with_replacement])\n",
    "    plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.02, contour=False)\n",
@ -394,7 +461,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 21,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -404,15 +471,14 @@
   "source": [
    "bag_clf = BaggingClassifier(\n",
    "    DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
-    "    bootstrap=True, n_jobs=-1, oob_score=True, random_state=40\n",
-    ")\n",
+    "    bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)\n",
    "bag_clf.fit(X_train, y_train)\n",
    "bag_clf.oob_score_"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 22,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -420,12 +486,12 @@
   },
   "outputs": [],
   "source": [
-    "bag_clf.oob_decision_function_[:10]"
+    "bag_clf.oob_decision_function_"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 23,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -450,7 +516,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 24,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -458,34 +524,13 @@
   },
   "outputs": [],
   "source": [
-    "from six.moves import urllib\n",
    "from sklearn.datasets import fetch_mldata\n",
-    "try:\n",
-    "    mnist = fetch_mldata('MNIST original')\n",
-    "except urllib.error.HTTPError as ex:\n",
-    "    print(\"Could not download MNIST data from mldata.org, trying alternative...\")\n",
-    "\n",
-    "    # Alternative method to load MNIST, if mldata.org is down\n",
-    "    from scipy.io import loadmat\n",
-    "    mnist_alternative_url = \"https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat\"\n",
-    "    mnist_path = \"./mnist-original.mat\"\n",
-    "    response = urllib.request.urlopen(mnist_alternative_url)\n",
-    "    with open(mnist_path, \"wb\") as f:\n",
-    "        content = response.read()\n",
-    "        f.write(content)\n",
-    "    mnist_raw = loadmat(mnist_path)\n",
-    "    mnist = {\n",
-    "        \"data\": mnist_raw[\"data\"].T,\n",
-    "        \"target\": mnist_raw[\"label\"][0],\n",
-    "        \"COL_NAMES\": [\"label\", \"data\"],\n",
-    "        \"DESCR\": \"mldata.org dataset: mnist-original\",\n",
-    "    }\n",
-    "    print(\"Success!\")"
+    "mnist = fetch_mldata('MNIST original')"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 25,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -499,7 +544,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 26,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@ -516,7 +561,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 27,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -545,27 +590,34 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 28,
   "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.ensemble import AdaBoostClassifier\n",
    "\n",
    "ada_clf = AdaBoostClassifier(\n",
-    "        DecisionTreeClassifier(max_depth=2), n_estimators=200,\n",
-    "        algorithm=\"SAMME.R\", learning_rate=0.5, random_state=42\n",
-    "    )\n",
-    "ada_clf.fit(X_train, y_train)\n",
+    "    DecisionTreeClassifier(max_depth=1), n_estimators=200,\n",
+    "    algorithm=\"SAMME.R\", learning_rate=0.5, random_state=42)\n",
+    "ada_clf.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
    "plot_decision_boundary(ada_clf, X, y)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 30,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -599,7 +651,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 31,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -622,7 +674,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 32,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "rnd.seed(42)\n",
+    "X = rnd.rand(100, 1) - 0.5\n",
+    "y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -632,29 +697,72 @@
   "source": [
    "from sklearn.tree import DecisionTreeRegressor\n",
    "\n",
-    "rnd.seed(42)\n",
-    "X = rnd.rand(100, 1) - 0.5\n",
-    "y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)\n",
-    "\n",
    "tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
-    "tree_reg1.fit(X, y)\n",
-    "\n",
-    "y2 = y - tree_reg1.predict(X)\n",
-    "tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
-    "tree_reg2.fit(X, y2)\n",
-    "\n",
-    "y3 = y2 - tree_reg2.predict(X)\n",
-    "tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
-    "tree_reg3.fit(X, y3)\n",
-    "\n",
-    "X_new = np.array([[0.8]])\n",
-    "y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))\n",
-    "print(y_pred)"
+    "tree_reg1.fit(X, y)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 34,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "y2 = y - tree_reg1.predict(X)\n",
+    "tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
+    "tree_reg2.fit(X, y2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "y3 = y2 - tree_reg2.predict(X)\n",
+    "tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
+    "tree_reg3.fit(X, y3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X_new = np.array([[0.8]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "y_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -707,7 +815,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 40,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -717,12 +825,30 @@
   "source": [
    "from sklearn.ensemble import GradientBoostingRegressor\n",
    "\n",
-    "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1, random_state=42)\n",
-    "gbrt.fit(X, y)\n",
-    "\n",
+    "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)\n",
+    "gbrt.fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
    "gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)\n",
-    "gbrt_slow.fit(X, y)\n",
-    "\n",
+    "gbrt_slow.fit(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
    "plt.figure(figsize=(11,4))\n",
    "\n",
    "plt.subplot(121)\n",
@ -749,7 +875,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 43,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -757,37 +883,37 @@
   },
   "outputs": [],
   "source": [
+    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
-    "X_train, X_val, y_train, y_val = train_test_split(X, y)\n",
+    "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)\n",
    "\n",
-    "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, learning_rate=0.1, random_state=42)\n",
+    "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)\n",
    "gbrt.fit(X_train, y_train)\n",
    "\n",
-    "errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
-   },
-   "outputs": [],
-   "source": [
-    "best_n_estimators = np.argmin(errors)\n",
-    "min_error = errors[best_n_estimators]\n",
+    "errors = [mean_squared_error(y_val, y_pred)\n",
+    "          for y_pred in gbrt.staged_predict(X_val)]\n",
+    "bst_n_estimators = np.argmin(errors) + 1  # errors[i] is the validation MSE with i + 1 trees\n",
    "\n",
-    "gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators, learning_rate=0.1, random_state=42)\n",
+    "gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators, random_state=42)\n",
    "gbrt_best.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 44,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "min_error = np.min(errors)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -799,17 +925,17 @@
    "\n",
    "plt.subplot(121)\n",
    "plt.plot(errors, \"b.-\")\n",
-    "plt.plot([best_n_estimators, best_n_estimators], [0, min_error], \"k--\")\n",
+    "plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], \"k--\")\n",
    "plt.plot([0, 120], [min_error, min_error], \"k--\")\n",
-    "plt.plot(best_n_estimators, min_error, \"ko\")\n",
-    "plt.text(best_n_estimators, min_error*1.2, \"Minimum\", ha=\"center\", fontsize=14)\n",
+    "plt.plot(bst_n_estimators, min_error, \"ko\")\n",
+    "plt.text(bst_n_estimators, min_error*1.2, \"Minimum\", ha=\"center\", fontsize=14)\n",
    "plt.axis([0, 120, 0, 0.01])\n",
    "plt.xlabel(\"Number of trees\")\n",
    "plt.title(\"Validation error\", fontsize=14)\n",
    "\n",
    "plt.subplot(122)\n",
    "plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])\n",
-    "plt.title(\"Best model (55 trees)\", fontsize=14)\n",
+    "plt.title(\"Best model (%d trees)\" % bst_n_estimators, fontsize=14)\n",
    "\n",
    "save_fig(\"early_stopping_gbrt_plot\")\n",
    "plt.show()"
@ -817,7 +943,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 46,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -825,7 +951,7 @@
   },
   "outputs": [],
   "source": [
-    "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=1, learning_rate=0.1, random_state=42, warm_start=True)\n",
+    "gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)\n",
    "\n",
    "min_val_error = float(\"inf\")\n",
    "error_going_up = 0\n",
@ -845,7 +971,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 47,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@ -905,7 +1031,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.5.2+"
+   "version": "3.5.3"
  },
  "nav_menu": {
   "height": "252px",