Synchronize chapter 7's code and the corresponding notebook's code

main
Aurélien Geron 2017-06-02 10:57:06 +02:00
parent 0e8579943c
commit b7779802f0
1 changed files with 258 additions and 132 deletions

View File

@ -134,9 +134,7 @@
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
"collapsed": true
},
"outputs": [],
"source": [
@ -144,23 +142,76 @@
"from sklearn.datasets import make_moons\n",
"\n",
"X, y = make_moons(n_samples=500, noise=0.30, random_state=42)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import VotingClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import SVC\n",
"\n",
"log_clf = LogisticRegression(random_state=42)\n",
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"svm_clf = SVC(random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
" voting='hard')\n",
"voting_clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n",
" clf.fit(X_train, y_train)\n",
" y_pred = clf.predict(X_test)\n",
" print(clf.__class__.__name__, accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"log_clf = LogisticRegression(random_state=42)\n",
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"svm_clf = SVC(probability=True, random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
" voting='soft'\n",
" )\n",
"voting_clf.fit(X_train, y_train)\n",
"\n",
" voting='soft')\n",
"voting_clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"for clf in (log_clf, rnd_clf, svm_clf, voting_clf):\n",
@ -181,7 +232,25 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.ensemble import BaggingClassifier\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"bag_clf = BaggingClassifier(\n",
" DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
" max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)\n",
"bag_clf.fit(X_train, y_train)\n",
"y_pred = bag_clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
@ -189,23 +258,13 @@
},
"outputs": [],
"source": [
"from sklearn.datasets import make_moons\n",
"from sklearn.ensemble import BaggingClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"\n",
"bag_clf = BaggingClassifier(\n",
" DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
" max_samples=100, bootstrap=True, n_jobs=-1, random_state=42\n",
" )\n",
"bag_clf.fit(X_train, y_train)\n",
"y_pred = bag_clf.predict(X_test)\n",
"print(accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"metadata": {
"collapsed": false,
"deletable": true,
@ -221,7 +280,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 12,
"metadata": {
"collapsed": true,
"deletable": true,
@ -251,7 +310,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 13,
"metadata": {
"collapsed": false,
"deletable": true,
@ -282,7 +341,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 14,
"metadata": {
"collapsed": false,
"deletable": true,
@ -292,16 +351,24 @@
"source": [
"bag_clf = BaggingClassifier(\n",
" DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n",
" n_estimators=500, max_samples=1.0, bootstrap=True,\n",
" n_jobs=-1, random_state=42\n",
" )\n",
" n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"bag_clf.fit(X_train, y_train)\n",
"y_pred = bag_clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 16,
"metadata": {
"collapsed": true,
"deletable": true,
@ -319,7 +386,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 17,
"metadata": {
"collapsed": false,
"deletable": true,
@ -332,7 +399,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 18,
"metadata": {
"collapsed": false,
"deletable": true,
@ -344,13 +411,13 @@
"iris = load_iris()\n",
"rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)\n",
"rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n",
"for name, importance in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n",
" print(name, \"=\", importance)"
"for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n",
" print(name, score)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 19,
"metadata": {
"collapsed": false,
"deletable": true,
@ -363,7 +430,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 20,
"metadata": {
"collapsed": false,
"deletable": true,
@ -374,7 +441,7 @@
"plt.figure(figsize=(6, 4))\n",
"\n",
"for i in range(15):\n",
" tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42+i)\n",
" tree_clf = DecisionTreeClassifier(max_leaf_nodes=16, random_state=42 + i)\n",
" indices_with_replacement = rnd.randint(0, len(X_train), len(X_train))\n",
" tree_clf.fit(X[indices_with_replacement], y[indices_with_replacement])\n",
" plot_decision_boundary(tree_clf, X, y, axes=[-1.5, 2.5, -1, 1.5], alpha=0.02, contour=False)\n",
@ -394,7 +461,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 21,
"metadata": {
"collapsed": false,
"deletable": true,
@ -404,15 +471,14 @@
"source": [
"bag_clf = BaggingClassifier(\n",
" DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
" bootstrap=True, n_jobs=-1, oob_score=True, random_state=40\n",
")\n",
" bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)\n",
"bag_clf.fit(X_train, y_train)\n",
"bag_clf.oob_score_"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 22,
"metadata": {
"collapsed": false,
"deletable": true,
@ -420,12 +486,12 @@
},
"outputs": [],
"source": [
"bag_clf.oob_decision_function_[:10]"
"bag_clf.oob_decision_function_"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 23,
"metadata": {
"collapsed": false,
"deletable": true,
@ -450,7 +516,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {
"collapsed": true,
"deletable": true,
@ -458,34 +524,13 @@
},
"outputs": [],
"source": [
"from six.moves import urllib\n",
"from sklearn.datasets import fetch_mldata\n",
"try:\n",
" mnist = fetch_mldata('MNIST original')\n",
"except urllib.error.HTTPError as ex:\n",
" print(\"Could not download MNIST data from mldata.org, trying alternative...\")\n",
"\n",
" # Alternative method to load MNIST, if mldata.org is down\n",
" from scipy.io import loadmat\n",
" mnist_alternative_url = \"https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat\"\n",
" mnist_path = \"./mnist-original.mat\"\n",
" response = urllib.request.urlopen(mnist_alternative_url)\n",
" with open(mnist_path, \"wb\") as f:\n",
" content = response.read()\n",
" f.write(content)\n",
" mnist_raw = loadmat(mnist_path)\n",
" mnist = {\n",
" \"data\": mnist_raw[\"data\"].T,\n",
" \"target\": mnist_raw[\"label\"][0],\n",
" \"COL_NAMES\": [\"label\", \"data\"],\n",
" \"DESCR\": \"mldata.org dataset: mnist-original\",\n",
" }\n",
" print(\"Success!\")"
"mnist = fetch_mldata('MNIST original')"
]
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 25,
"metadata": {
"collapsed": false,
"deletable": true,
@ -499,7 +544,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 26,
"metadata": {
"collapsed": true,
"deletable": true,
@ -516,7 +561,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 27,
"metadata": {
"collapsed": false,
"deletable": true,
@ -545,27 +590,34 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 28,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.ensemble import AdaBoostClassifier\n",
"\n",
"ada_clf = AdaBoostClassifier(\n",
" DecisionTreeClassifier(max_depth=2), n_estimators=200,\n",
" algorithm=\"SAMME.R\", learning_rate=0.5, random_state=42\n",
" )\n",
"ada_clf.fit(X_train, y_train)\n",
" DecisionTreeClassifier(max_depth=1), n_estimators=200,\n",
" algorithm=\"SAMME.R\", learning_rate=0.5, random_state=42)\n",
"ada_clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plot_decision_boundary(ada_clf, X, y)"
]
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 30,
"metadata": {
"collapsed": false,
"deletable": true,
@ -599,7 +651,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 31,
"metadata": {
"collapsed": false,
"deletable": true,
@ -622,7 +674,20 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 32,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"rnd.seed(42)\n",
"X = rnd.rand(100, 1) - 0.5\n",
"y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {
"collapsed": false,
"deletable": true,
@ -632,29 +697,72 @@
"source": [
"from sklearn.tree import DecisionTreeRegressor\n",
"\n",
"rnd.seed(42)\n",
"X = rnd.rand(100, 1) - 0.5\n",
"y = 3*X[:, 0]**2 + 0.05 * rnd.randn(100)\n",
"\n",
"tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
"tree_reg1.fit(X, y)\n",
"\n",
"y2 = y - tree_reg1.predict(X)\n",
"tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
"tree_reg2.fit(X, y2)\n",
"\n",
"y3 = y2 - tree_reg2.predict(X)\n",
"tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
"tree_reg3.fit(X, y3)\n",
"\n",
"X_new = np.array([[0.8]])\n",
"y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))\n",
"print(y_pred)"
"tree_reg1.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"y2 = y - tree_reg1.predict(X)\n",
"tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
"tree_reg2.fit(X, y2)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"y3 = y2 - tree_reg2.predict(X)\n",
"tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)\n",
"tree_reg3.fit(X, y3)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"X_new = np.array([[0.8]])"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"y_pred"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"collapsed": false,
"deletable": true,
@ -707,7 +815,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 40,
"metadata": {
"collapsed": false,
"deletable": true,
@ -717,12 +825,30 @@
"source": [
"from sklearn.ensemble import GradientBoostingRegressor\n",
"\n",
"gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1, random_state=42)\n",
"gbrt.fit(X, y)\n",
"\n",
"gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0, random_state=42)\n",
"gbrt.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"gbrt_slow = GradientBoostingRegressor(max_depth=2, n_estimators=200, learning_rate=0.1, random_state=42)\n",
"gbrt_slow.fit(X, y)\n",
"\n",
"gbrt_slow.fit(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plt.figure(figsize=(11,4))\n",
"\n",
"plt.subplot(121)\n",
@ -749,7 +875,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 43,
"metadata": {
"collapsed": false,
"deletable": true,
@ -757,37 +883,37 @@
},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_squared_error\n",
"\n",
"X_train, X_val, y_train, y_val = train_test_split(X, y)\n",
"X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)\n",
"\n",
"gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, learning_rate=0.1, random_state=42)\n",
"gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120, random_state=42)\n",
"gbrt.fit(X_train, y_train)\n",
"\n",
"errors = [mean_squared_error(y_val, y_pred) for y_pred in gbrt.staged_predict(X_val)]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"best_n_estimators = np.argmin(errors)\n",
"min_error = errors[best_n_estimators]\n",
"errors = [mean_squared_error(y_val, y_pred)\n",
" for y_pred in gbrt.staged_predict(X_val)]\n",
"bst_n_estimators = np.argmin(errors)\n",
"\n",
"gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators, learning_rate=0.1, random_state=42)\n",
"gbrt_best = GradientBoostingRegressor(max_depth=2,n_estimators=bst_n_estimators, random_state=42)\n",
"gbrt_best.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"min_error = np.min(errors)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false,
"deletable": true,
@ -799,17 +925,17 @@
"\n",
"plt.subplot(121)\n",
"plt.plot(errors, \"b.-\")\n",
"plt.plot([best_n_estimators, best_n_estimators], [0, min_error], \"k--\")\n",
"plt.plot([bst_n_estimators, bst_n_estimators], [0, min_error], \"k--\")\n",
"plt.plot([0, 120], [min_error, min_error], \"k--\")\n",
"plt.plot(best_n_estimators, min_error, \"ko\")\n",
"plt.text(best_n_estimators, min_error*1.2, \"Minimum\", ha=\"center\", fontsize=14)\n",
"plt.plot(bst_n_estimators, min_error, \"ko\")\n",
"plt.text(bst_n_estimators, min_error*1.2, \"Minimum\", ha=\"center\", fontsize=14)\n",
"plt.axis([0, 120, 0, 0.01])\n",
"plt.xlabel(\"Number of trees\")\n",
"plt.title(\"Validation error\", fontsize=14)\n",
"\n",
"plt.subplot(122)\n",
"plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])\n",
"plt.title(\"Best model (55 trees)\", fontsize=14)\n",
"plt.title(\"Best model (%d trees)\" % bst_n_estimators, fontsize=14)\n",
"\n",
"save_fig(\"early_stopping_gbrt_plot\")\n",
"plt.show()"
@ -817,7 +943,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 46,
"metadata": {
"collapsed": false,
"deletable": true,
@ -825,7 +951,7 @@
},
"outputs": [],
"source": [
"gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=1, learning_rate=0.1, random_state=42, warm_start=True)\n",
"gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)\n",
"\n",
"min_val_error = float(\"inf\")\n",
"error_going_up = 0\n",
@ -845,7 +971,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 47,
"metadata": {
"collapsed": false,
"deletable": true,
@ -905,7 +1031,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2+"
"version": "3.5.3"
},
"nav_menu": {
"height": "252px",