diff --git a/07_ensemble_learning_and_random_forests.ipynb b/07_ensemble_learning_and_random_forests.ipynb index 682c899..6598c30 100644 --- a/07_ensemble_learning_and_random_forests.ipynb +++ b/07_ensemble_learning_and_random_forests.ipynb @@ -134,9 +134,7 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -144,23 +142,76 @@ "from sklearn.datasets import make_moons\n", "\n", "X, y = make_moons(n_samples=500, noise=0.30, random_state=42)\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", - "\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.ensemble import VotingClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "\n", + "log_clf = LogisticRegression(random_state=42)\n", + "rnd_clf = RandomForestClassifier(random_state=42)\n", + "svm_clf = SVC(random_state=42)\n", + "\n", + "voting_clf = VotingClassifier(\n", + " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", + " voting='hard')\n", + "voting_clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.metrics import accuracy_score\n", + "\n", + "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n", + " clf.fit(X_train, y_train)\n", + " y_pred = clf.predict(X_test)\n", + " print(clf.__class__.__name__, accuracy_score(y_test, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "log_clf = LogisticRegression(random_state=42)\n", "rnd_clf = RandomForestClassifier(random_state=42)\n", "svm_clf = SVC(probability=True, random_state=42)\n", "\n", "voting_clf = VotingClassifier(\n", - " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", - " voting='soft'\n", - " )\n", - "voting_clf.fit(X_train, y_train)\n", - "\n", + " estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n", + " voting='soft')\n", + "voting_clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "from sklearn.metrics import accuracy_score\n", "\n", "for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n", @@ -181,7 +232,25 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import BaggingClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "bag_clf = BaggingClassifier(\n", + " DecisionTreeClassifier(random_state=42), n_estimators=500,\n", + " max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)\n", + "bag_clf.fit(X_train, y_train)\n", + "y_pred = bag_clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": { "collapsed": false, "deletable": true, @@ -189,23 +258,13 @@ }, "outputs": [], "source": [ - "from sklearn.datasets import make_moons\n", - "from sklearn.ensemble import BaggingClassifier\n", "from sklearn.metrics import accuracy_score\n", - "from sklearn.tree import DecisionTreeClassifier\n", - "\n", - "bag_clf = BaggingClassifier(\n", - " DecisionTreeClassifier(random_state=42), n_estimators=500,\n", - " max_samples=100, bootstrap=True, n_jobs=-1, random_state=42\n", - " )\n", - "bag_clf.fit(X_train, y_train)\n", - "y_pred = bag_clf.predict(X_test)\n", "print(accuracy_score(y_test, y_pred))" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": { "collapsed": false, "deletable": true, @@ -221,7 +280,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": { "collapsed": true, "deletable": true, @@ -251,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": { "collapsed": false, "deletable": true, @@ -282,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": { "collapsed": false, "deletable": true, @@ -291,17 +350,25 @@ "outputs": [], "source": [ "bag_clf = BaggingClassifier(\n", - " DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n", - " n_estimators=500, max_samples=1.0, bootstrap=True,\n", - " n_jobs=-1, random_state=42\n", - " )\n", + " DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n", + " n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "bag_clf.fit(X_train, y_train)\n", "y_pred = bag_clf.predict(X_test)" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 16, "metadata": { "collapsed": true, "deletable": true, @@ -319,7 +386,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "metadata": { "collapsed": false, "deletable": true, @@ -332,7 +399,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 18, "metadata": { "collapsed": false, "deletable": true, @@ -344,13 +411,13 @@ "iris = load_iris()\n", "rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)\n", "rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n", - "for name, importance in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n", - " print(name, \"=\", importance)" + "for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n", + " print(name, score)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 19, "metadata": { "collapsed": false, "deletable": true, @@ -363,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 20, "metadata": { "collapsed": false, "deletable": true, @@ -374,7 +441,7 @@ "plt.figure(figsize=(6, 4))\n", "\n", "for i in range(15):\n", - " tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42+i)\n", + " tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)\n", " indices_with_replacement = rnd.randint(0, len(X_train), len(X_train))\n", " tree_clf.fit(X[indices_with_replacement], y[indices_with_replacement])\n", " plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.02, contour=False)\n", @@ -394,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 21, "metadata": { "collapsed": false, "deletable": true, @@ -404,15 +471,14 @@ "source": [ "bag_clf = BaggingClassifier(\n", " DecisionTreeClassifier(random_state=42), n_estimators=500,\n", - " bootstrap=True, n_jobs=-1, oob_score=True, random_state=40\n", - ")\n", + " bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)\n", "bag_clf.fit(X_train, y_train)\n", "bag_clf.oob_score_" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 22, "metadata": { "collapsed": false, "deletable": true, @@ -420,12 +486,12 @@ }, "outputs": [], "source": [ - "bag_clf.oob_decision_function_[:10]" + "bag_clf.oob_decision_function_" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 23, "metadata": { "collapsed": false, "deletable": true, @@ -450,7 +516,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": { "collapsed": true, "deletable": true, @@ -458,34 +524,13 @@ }, "outputs": [], "source": [ - "from six.moves import urllib\n", "from sklearn.datasets import fetch_mldata\n", - "try:\n", - " mnist = fetch_mldata('MNIST original')\n", - "except urllib.error.HTTPError as ex:\n", - " print(\"Could not download MNIST data from mldata.org, trying alternative...\")\n", - "\n", - " # Alternative method to load MNIST, if mldata.org is down\n", - " from scipy.io import loadmat\n", - " mnist_alternative_url = \"https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat\"\n", - " mnist_path = \"./mnist-original.mat\"\n", - " response = urllib.request.urlopen(mnist_alternative_url)\n", - " with open(mnist_path, \"wb\") as f:\n", - " content = response.read()\n", - " f.write(content)\n", - " mnist_raw = loadmat(mnist_path)\n", - " mnist = {\n", - " \"data\": mnist_raw[\"data\"].T,\n", - " \"target\": mnist_raw[\"label\"][0],\n", - " \"COL_NAMES\": [\"label\", \"data\"],\n", - " \"DESCR\": \"mldata.org dataset: mnist-original\",\n", - " }\n", - " print(\"Success!\")" + "mnist = fetch_mldata('MNIST original')" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 25, "metadata": { "collapsed": false, "deletable": true, @@ -499,7 +544,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 26, "metadata": { "collapsed": true, "deletable": true, @@ -516,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 27, "metadata": { "collapsed": false, "deletable": true, @@ -545,27 +590,34 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 28, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "collapsed": false }, "outputs": [], "source": [ "from sklearn.ensemble import AdaBoostClassifier\n", "\n", "ada_clf = AdaBoostClassifier(\n", - " DecisionTreeClassifier(max_depth=2), n_estimators=200,\n", - " algorithm=\"SAMME.R\", learning_rate=0.5, random_state=42\n", - " )\n", - "ada_clf.fit(X_train, y_train)\n", + " DecisionTreeClassifier(max_depth=1), n_estimators=200,\n", + " algorithm=\"SAMME.R\", learning_rate=0.5, random_state=42)\n", + "ada_clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "plot_decision_boundary(ada_clf, X, y)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 30, "metadata": { "collapsed": false, "deletable": true, @@ -599,7 +651,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 31, "metadata": { "collapsed": false, "deletable": true, @@ -622,7 +674,20 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "rnd.seed(42)\n", + "X = rnd.rand(100, 1) - 0.5\n", + "y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, "metadata": { "collapsed": false, "deletable": true, @@ -632,29 +697,72 @@ "source": [ "from sklearn.tree import DecisionTreeRegressor\n", "\n", - "rnd.seed(42)\n", - "X = rnd.rand(100, 1) - 0.5\n", - "y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)\n", - "\n", "tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)\n", - "tree_reg1.fit(X, y)\n", - "\n", - "y2 = y - tree_reg1.predict(X)\n", - "tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)\n", - "tree_reg2.fit(X, y2)\n", - "\n", - "y3 = y2 - tree_reg2.predict(X)\n", - "tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)\n", - "tree_reg3.fit(X, y3)\n", - "\n", - "X_new = np.array([[0.8]])\n", - "y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))\n", - "print(y_pred)" + "tree_reg1.fit(X, y)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "y2 = y - tree_reg1.predict(X)\n", + "tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)\n", + "tree_reg2.fit(X, y2)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "y3 = y2 - tree_reg2.predict(X)\n", + "tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)\n", + "tree_reg3.fit(X, y3)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X_new = np.array([[0.8]])" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 39, "metadata": { "collapsed": false, "deletable": true, @@ -707,7 +815,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 40, "metadata": { "collapsed": false, "deletable": true, @@ -717,12 +825,30 @@ "source": [ "from sklearn.ensemble import GradientBoostingRegressor\n", "\n", - "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1, random_state=42)\n", - "gbrt.fit(X, y)\n", - "\n", + "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)\n", + "gbrt.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)\n", - "gbrt_slow.fit(X, y)\n", - "\n", + "gbrt_slow.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "plt.figure(figsize=(11,4))\n", "\n", "plt.subplot(121)\n", @@ -749,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 43, "metadata": { "collapsed": false, "deletable": true, @@ -757,37 +883,37 @@ }, "outputs": [], "source": [ + "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import mean_squared_error\n", "\n", - "X_train, X_val, y_train, y_val = train_test_split(X, y)\n", + "X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)\n", "\n", - "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, learning_rate=0.1, random_state=42)\n", + "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)\n", "gbrt.fit(X_train, y_train)\n", "\n", - "errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "best_n_estimators = np.argmin(errors)\n", - "min_error = errors[best_n_estimators]\n", + "errors = [mean_squared_error(y_val, y_pred)\n", + " for y_pred in gbrt.staged_predict(X_val)]\n", + "bst_n_estimators = np.argmin(errors)\n", "\n", - "gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators, learning_rate=0.1, random_state=42)\n", + "gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators, random_state=42)\n", "gbrt_best.fit(X_train, y_train)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 44, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "min_error = np.min(errors)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, "metadata": { "collapsed": false, "deletable": true, @@ -799,17 +925,17 @@ "\n", "plt.subplot(121)\n", "plt.plot(errors, \"b.-\")\n", - "plt.plot([best_n_estimators, best_n_estimators], [0, min_error], \"k--\")\n", + "plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], \"k--\")\n", "plt.plot([0, 120], [min_error, min_error], \"k--\")\n", - "plt.plot(best_n_estimators, min_error, \"ko\")\n", - "plt.text(best_n_estimators, min_error*1.2, \"Minimum\", ha=\"center\", fontsize=14)\n", + "plt.plot(bst_n_estimators, min_error, \"ko\")\n", + "plt.text(bst_n_estimators, min_error*1.2, \"Minimum\", ha=\"center\", fontsize=14)\n", "plt.axis([0, 120, 0, 0.01])\n", "plt.xlabel(\"Number of trees\")\n", "plt.title(\"Validation error\", fontsize=14)\n", "\n", "plt.subplot(122)\n", "plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])\n", - "plt.title(\"Best model (55 trees)\", fontsize=14)\n", + "plt.title(\"Best model (%d trees)\" % bst_n_estimators, fontsize=14)\n", "\n", "save_fig(\"early_stopping_gbrt_plot\")\n", "plt.show()" @@ -817,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 46, "metadata": { "collapsed": false, "deletable": true, @@ -825,7 +951,7 @@ }, "outputs": [], "source": [ - "gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=1, learning_rate=0.1, random_state=42, warm_start=True)\n", + "gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)\n", "\n", "min_val_error = float(\"inf\")\n", "error_going_up = 0\n", @@ -845,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 47, "metadata": { "collapsed": false, "deletable": true, @@ -905,7 +1031,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2+" + "version": "3.5.3" }, "nav_menu": { "height": "252px",