diff --git a/03_classification.ipynb b/03_classification.ipynb index 1eda357..de19856 100644 --- a/03_classification.ipynb +++ b/03_classification.ipynb @@ -39,7 +39,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures." + "Let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures." ] }, { @@ -52,23 +52,100 @@ "import sys\n", "assert sys.version_info >= (3, 8)\n", "\n", - "# Scikit-Learn ≥1.0 is required\n", + "# Scikit-Learn ≥1.0.1 is required\n", "import sklearn\n", - "assert sklearn.__version__ >= \"1.0\"\n", + "assert sklearn.__version__ >= \"1.0.1\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MNIST" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.datasets import fetch_openml\n", "\n", - "# Common imports\n", + "mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n", + "mnist.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "mnist.data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "mnist.target" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ "import numpy as np\n", - "from pathlib import Path\n", "\n", + "X, y = mnist[\"data\"], mnist[\"target\"].astype(np.int8)\n", + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "y.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "28 * 28" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell is not shown in the book. 
It's just here to define the default font size for the figures, and to define the `save_fig()` function which is used to save the figures in high-resolution for the book, just like we did in the previous chapter:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ "# To plot pretty figures\n", - "%matplotlib inline\n", + "from pathlib import Path\n", "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "mpl.rc('axes', labelsize=14)\n", - "mpl.rc('xtick', labelsize=12)\n", - "mpl.rc('ytick', labelsize=12)\n", "\n", - "# Where to save the figures\n", + "mpl.rc('font', size=12)\n", + "mpl.rc('axes', labelsize=14, titlesize=14)\n", + "mpl.rc('legend', fontsize=14)\n", + "\n", + "# To save the figures in high-res for the book\n", "IMAGES_PATH = Path() / \"images\" / \"classification\"\n", "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", "\n", @@ -79,107 +156,25 @@ " plt.savefig(path, format=fig_extension, dpi=resolution)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# MNIST" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning:** since Scikit-Learn 0.24, `fetch_openml()` returns a Pandas `DataFrame` by default. To avoid this and keep the same code as in the book, we use `as_frame=False`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.datasets import fetch_openml\n", - "mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n", - "mnist.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "X, y = mnist[\"data\"], mnist[\"target\"]\n", - "X.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "y.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "28 * 28" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "%matplotlib inline\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "\n", - "some_digit = X[0]\n", - "some_digit_image = some_digit.reshape(28, 28)\n", - "plt.imshow(some_digit_image, cmap=mpl.cm.binary)\n", - "plt.axis(\"off\")\n", - "\n", - "save_fig(\"some_digit_plot\")\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "y[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "y = y.astype(np.uint8)" - ] - }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "def plot_digit(data):\n", - " image = data.reshape(28, 28)\n", - " plt.imshow(image, cmap = mpl.cm.binary,\n", - " interpolation=\"nearest\")\n", - " plt.axis(\"off\")" + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def plot_digit(image_data):\n", + " image = image_data.reshape(28, 28)\n", + " plt.imshow(image, cmap=mpl.cm.binary, interpolation=\"nearest\")\n", + " plt.axis(\"off\")\n", + "\n", + "some_digit = X[0]\n", + "plot_digit(some_digit)\n", + "\n", + "save_fig(\"some_digit_plot\")\n", + "plt.show()" ] }, { @@ 
-188,29 +183,7 @@ "metadata": {}, "outputs": [], "source": [ - "# EXTRA\n", - "def plot_digits(instances, images_per_row=10, **options):\n", - " size = 28\n", - " images_per_row = min(len(instances), images_per_row)\n", - " # This is equivalent to n_rows = ceil(len(instances) / images_per_row):\n", - " n_rows = (len(instances) - 1) // images_per_row + 1\n", - "\n", - " # Append empty images to fill the end of the grid, if needed:\n", - " n_empty = n_rows * images_per_row - len(instances)\n", - " padded_instances = np.concatenate([instances, np.zeros((n_empty, size * size))], axis=0)\n", - "\n", - " # Reshape the array so it's organized as a grid containing 28×28 images:\n", - " image_grid = padded_instances.reshape((n_rows, images_per_row, size, size))\n", - "\n", - " # Combine axes 0 and 2 (vertical image grid axis, and vertical image axis),\n", - " # and axes 1 and 3 (horizontal axes). We first need to move the axes that we\n", - " # want to combine next to each other, using transpose(), and only then we\n", - " # can reshape:\n", - " big_image = image_grid.transpose(0, 2, 1, 3).reshape(n_rows * size,\n", - " images_per_row * size)\n", - " # Now that we have a big image, we just need to show it:\n", - " plt.imshow(big_image, cmap = mpl.cm.binary, **options)\n", - " plt.axis(\"off\")" + "y[0]" ] }, { @@ -219,10 +192,12 @@ "metadata": {}, "outputs": [], "source": [ - "plt.figure(figsize=(9,9))\n", - "example_images = X[:100]\n", - "plot_digits(example_images, images_per_row=10)\n", - "save_fig(\"more_digits_plot\")\n", + "plt.figure(figsize=(9, 9))\n", + "for idx, image_data in enumerate(X[:100]):\n", + " plt.subplot(10, 10, idx + 1)\n", + " plot_digit(image_data)\n", + "plt.subplots_adjust(wspace=0, hspace=0)\n", + "save_fig(\"more_digits_plot\", tight_layout=False)\n", "plt.show()" ] }, @@ -231,15 +206,6 @@ "execution_count": 12, "metadata": {}, "outputs": [], - "source": [ - "y[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - 
"outputs": [], "source": [ "X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]" ] @@ -253,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -262,10 +228,15 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 14, "metadata": {}, + "outputs": [], "source": [ - "**Note**: some hyperparameters will have a different defaut value in future versions of Scikit-Learn, such as `max_iter` and `tol`. To be future-proof, we explicitly set these hyperparameters to their future default values. For simplicity, this is not shown in the book." + "from sklearn.linear_model import SGDClassifier\n", + "\n", + "sgd_clf = SGDClassifier(random_state=42)\n", + "sgd_clf.fit(X_train, y_train_5)" ] }, { @@ -274,10 +245,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import SGDClassifier\n", - "\n", - "sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)\n", - "sgd_clf.fit(X_train, y_train_5)" + "sgd_clf.predict([some_digit])" ] }, { @@ -285,17 +253,9 @@ "execution_count": 16, "metadata": {}, "outputs": [], - "source": [ - "sgd_clf.predict([some_digit])" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", + "\n", "cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring=\"accuracy\")" ] }, @@ -315,33 +275,38 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import StratifiedKFold\n", - "from sklearn.base import clone\n", "\n", "skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)\n", "\n", "for train_index, test_index in skfolds.split(X_train, y_train_5):\n", - " clone_clf = clone(sgd_clf)\n", " X_train_folds = X_train[train_index]\n", " y_train_folds = y_train_5[train_index]\n", " 
X_test_fold = X_train[test_index]\n", " y_test_fold = y_train_5[test_index]\n", "\n", - " clone_clf.fit(X_train_folds, y_train_folds)\n", - " y_pred = clone_clf.predict(X_test_fold)\n", + " sgd_clf_cv = SGDClassifier(random_state=42)\n", + " sgd_clf_cv.fit(X_train_folds, y_train_folds)\n", + " y_pred = sgd_clf_cv.predict(X_test_fold)\n", " n_correct = sum(y_pred == y_test_fold)\n", " print(n_correct / len(y_pred))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 18, "metadata": {}, + "outputs": [], "source": [ - "**Note**: `shuffle=True` was omitted by mistake in previous releases of the book." + "from sklearn.dummy import DummyClassifier\n", + "\n", + "dummy_clf = DummyClassifier()\n", + "dummy_clf.fit(X_train, y_train_5)\n", + "np.any(dummy_clf.predict(X_train))" ] }, { @@ -350,33 +315,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.base import BaseEstimator\n", - "class Never5Classifier(BaseEstimator):\n", - " def fit(self, X, y=None):\n", - " pass\n", - " def predict(self, X):\n", - " return np.zeros((len(X), 1), dtype=bool)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "never_5_clf = Never5Classifier()\n", - "cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring=\"accuracy\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: this output (and many others in this notebook and other notebooks) may differ slightly from those in the book. Don't worry, that's okay! There are several reasons for this:\n", - "* first, Scikit-Learn and other libraries evolve, and algorithms get tweaked a bit, which may change the exact result you get. If you use the latest Scikit-Learn version (and in general, you really should), you probably won't be using the exact same version I used when I wrote the book or this notebook, hence the difference. 
I try to keep this notebook reasonably up to date, but I can't change the numbers on the pages in your copy of the book.\n", - "* second, many training algorithms are stochastic, meaning they rely on randomness. In principle, it's possible to get consistent outputs from a random number generator by setting the seed from which it generates the pseudo-random numbers (which is why you will see `random_state=42` or `np.random.seed(42)` pretty often). However, sometimes this does not suffice due to the other factors listed here.\n", - "* third, if the training algorithm runs across multiple threads (as do some algorithms implemented in C) or across multiple processes (e.g., when using the `n_jobs` argument), then the precise order in which operations will run is not always guaranteed, and thus the exact result may vary slightly.\n", - "* lastly, other things may prevent perfect reproducibility, such as Python dicts and sets whose order is not guaranteed to be stable across sessions, or the order of files in a directory which is also not guaranteed." 
+ "cross_val_score(dummy_clf, X_train, y_train_5, cv=3, scoring=\"accuracy\")" ] }, { @@ -388,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -399,7 +338,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -410,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -427,7 +366,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -438,7 +377,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -448,7 +387,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -457,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -466,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -477,7 +416,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -493,7 +432,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -503,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -513,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -522,7 +461,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -533,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, 
"metadata": {}, "outputs": [], "source": [ @@ -543,7 +482,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -554,38 +493,33 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):\n", - " plt.plot(thresholds, precisions[:-1], \"b--\", label=\"Precision\", linewidth=2)\n", - " plt.plot(thresholds, recalls[:-1], \"g-\", label=\"Recall\", linewidth=2)\n", - " plt.legend(loc=\"center right\", fontsize=16) # Not shown in the book\n", - " plt.xlabel(\"Threshold\", fontsize=16) # Not shown\n", - " plt.grid(True) # Not shown\n", - " plt.axis([-50000, 50000, 0, 1]) # Not shown\n", - "\n", - "\n", - "\n", "recall_90_precision = recalls[np.argmax(precisions >= 0.90)]\n", "threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]\n", "\n", + "plt.figure(figsize=(8, 4)) # not in the book\n", + "plt.plot(thresholds, precisions[:-1], \"b--\", label=\"Precision\", linewidth=2)\n", + "plt.plot(thresholds, recalls[:-1], \"g-\", label=\"Recall\", linewidth=2)\n", + "plt.vlines(threshold_90_precision, 0, 1.0, \"k\", \"dotted\", label=\"threshold\")\n", + "\n", + "# not in the book (just beautifies the figure)\n", + "plt.plot(threshold_90_precision, recall_90_precision, \"go\")\n", + "plt.plot(threshold_90_precision, 0.90, \"bo\")\n", + "plt.axis([-50000, 50000, 0, 1])\n", + "plt.grid(True)\n", + "plt.xlabel(\"Threshold\")\n", + "plt.legend(loc=\"center right\")\n", + "save_fig(\"precision_recall_vs_threshold_plot\")\n", "\n", - "plt.figure(figsize=(8, 4)) # Not shown\n", - "plot_precision_recall_vs_threshold(precisions, recalls, thresholds)\n", - "plt.plot([threshold_90_precision, threshold_90_precision], [0., 0.9], \"r:\") # Not shown\n", - "plt.plot([-50000, threshold_90_precision], [0.9, 0.9], \"r:\") # Not shown\n", - "plt.plot([-50000, 
threshold_90_precision], [recall_90_precision, recall_90_precision], \"r:\")# Not shown\n", - "plt.plot([threshold_90_precision], [0.9], \"ro\") # Not shown\n", - "plt.plot([threshold_90_precision], [recall_90_precision], \"ro\") # Not shown\n", - "save_fig(\"precision_recall_vs_threshold_plot\") # Not shown\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -594,29 +528,30 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "def plot_precision_vs_recall(precisions, recalls):\n", - " plt.plot(recalls, precisions, \"b-\", linewidth=2)\n", - " plt.xlabel(\"Recall\", fontsize=16)\n", - " plt.ylabel(\"Precision\", fontsize=16)\n", - " plt.axis([0, 1, 0, 1])\n", - " plt.grid(True)\n", + "plt.figure(figsize=(8, 6)) # not in the book\n", "\n", - "plt.figure(figsize=(8, 6))\n", - "plot_precision_vs_recall(precisions, recalls)\n", - "plt.plot([recall_90_precision, recall_90_precision], [0., 0.9], \"r:\")\n", - "plt.plot([0.0, recall_90_precision], [0.9, 0.9], \"r:\")\n", - "plt.plot([recall_90_precision], [0.9], \"ro\")\n", + "plt.plot(recalls, precisions, linewidth=2)\n", + "plt.plot([recall_90_precision, recall_90_precision], [0., 0.9], \"k:\")\n", + "plt.plot([0.0, recall_90_precision], [0.9, 0.9], \"k:\")\n", + "plt.plot([recall_90_precision], [0.9], \"ko\")\n", + "\n", + "# not in the book (just beautifies the figure)\n", + "plt.xlabel(\"Recall\")\n", + "plt.ylabel(\"Precision\")\n", + "plt.axis([0, 1, 0, 1])\n", + "plt.grid(True)\n", "save_fig(\"precision_vs_recall_plot\")\n", + "\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -625,7 +560,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -634,7 +569,7 @@ }, { "cell_type": "code", - 
"execution_count": 41, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -643,7 +578,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -652,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -668,7 +603,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -679,31 +614,37 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "def plot_roc_curve(fpr, tpr, label=None):\n", - " plt.plot(fpr, tpr, linewidth=2, label=label)\n", - " plt.plot([0, 1], [0, 1], 'k--') # dashed diagonal\n", - " plt.axis([0, 1, 0, 1]) # Not shown in the book\n", - " plt.xlabel('False Positive Rate (Fall-Out)', fontsize=16) # Not shown\n", - " plt.ylabel('True Positive Rate (Recall)', fontsize=16) # Not shown\n", - " plt.grid(True) # Not shown\n", + " plt.plot(fpr, tpr, linewidth=2, label=label) # ROC curve\n", + " plt.plot([0, 1], [0, 1], 'k--') # dashed diagonal\n", "\n", - "plt.figure(figsize=(8, 6)) # Not shown\n", + " # not in the book (just beautifies the figure)\n", + " plt.axis([0, 1, 0, 1])\n", + " plt.xlabel('False Positive Rate (Fall-Out)')\n", + " plt.ylabel('True Positive Rate (Recall)')\n", + " plt.grid(True)\n", + "\n", + "\n", + "plt.figure(figsize=(8, 6)) # Not in the book\n", "plot_roc_curve(fpr, tpr)\n", - "fpr_90 = fpr[np.argmax(tpr >= recall_90_precision)] # Not shown\n", - "plt.plot([fpr_90, fpr_90], [0., recall_90_precision], \"r:\") # Not shown\n", - "plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], \"r:\") # Not shown\n", - "plt.plot([fpr_90], [recall_90_precision], \"ro\") # Not shown\n", - "save_fig(\"roc_curve_plot\") # Not shown\n", + "\n", + "# not in the book (just beautifies the figure)\n", + "fpr_90 = fpr[np.argmax(tpr >= 
recall_90_precision)]\n", + "plt.plot([fpr_90, fpr_90], [0., recall_90_precision], \"r:\")\n", + "plt.plot([0.0, fpr_90], [recall_90_precision, recall_90_precision], \"r:\")\n", + "plt.plot([fpr_90], [recall_90_precision], \"ro\")\n", + "save_fig(\"roc_curve_plot\")\n", + "\n", "plt.show()" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -713,10 +654,16 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 46, "metadata": {}, + "outputs": [], "source": [ - "**Note**: we set `n_estimators=100` to be future-proof since this will be the default value in Scikit-Learn 0.22." + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "forest_clf = RandomForestClassifier(random_state=42)\n", + "y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,\n", + " method=\"predict_proba\")" ] }, { @@ -725,10 +672,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.ensemble import RandomForestClassifier\n", - "forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n", - "y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,\n", - " method=\"predict_proba\")" + "y_probas_forest.shape" ] }, { @@ -737,7 +681,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class\n", + "y_scores_forest = y_probas_forest[:, 1] # 2nd column = proba of positive class\n", "fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)" ] }, @@ -747,6 +691,7 @@ "metadata": {}, "outputs": [], "source": [ + "# not in the book\n", "recall_for_forest = tpr_forest[np.argmax(fpr_forest >= fpr_90)]\n", "\n", "plt.figure(figsize=(8, 6))\n", @@ -758,7 +703,7 @@ "plt.plot([fpr_90, fpr_90], [0., recall_for_forest], \"r:\")\n", "plt.plot([fpr_90], [recall_for_forest], \"ro\")\n", "plt.grid(True)\n", - "plt.legend(loc=\"lower right\", fontsize=16)\n", + 
"plt.legend(loc=\"lower right\")\n", "save_fig(\"roc_curve_comparison_plot\")\n", "plt.show()" ] @@ -798,6 +743,13 @@ "# Multiclass Classification" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "SVMs do not scale well, so let's only train on the first 2,000 instances, or else this section will take a very long time to run:" + ] + }, { "cell_type": "code", "execution_count": 53, @@ -807,7 +759,7 @@ "from sklearn.svm import SVC\n", "\n", "svm_clf = SVC(gamma=\"auto\", random_state=42)\n", - "svm_clf.fit(X_train[:1000], y_train[:1000]) # y_train, not y_train_5\n", + "svm_clf.fit(X_train[:2000], y_train[:2000]) # y_train, not y_train_5\n", "svm_clf.predict([some_digit])" ] }, @@ -818,7 +770,7 @@ "outputs": [], "source": [ "some_digit_scores = svm_clf.decision_function([some_digit])\n", - "some_digit_scores" + "np.round(some_digit_scores, 2)" ] }, { @@ -827,7 +779,10 @@ "metadata": {}, "outputs": [], "source": [ - "np.argmax(some_digit_scores)" + "# Not in the book\n", + "svm_clf.decision_function_shape = \"ovo\"\n", + "some_digit_scores_ovo = svm_clf.decision_function([some_digit])\n", + "np.round(some_digit_scores_ovo, 2)" ] }, { @@ -836,7 +791,7 @@ "metadata": {}, "outputs": [], "source": [ - "svm_clf.classes_" + "np.argmax(some_digit_scores)" ] }, { @@ -845,7 +800,7 @@ "metadata": {}, "outputs": [], "source": [ - "svm_clf.classes_[5]" + "svm_clf.classes_" ] }, { @@ -854,10 +809,7 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.multiclass import OneVsRestClassifier\n", - "ovr_clf = OneVsRestClassifier(SVC(gamma=\"auto\", random_state=42))\n", - "ovr_clf.fit(X_train[:1000], y_train[:1000])\n", - "ovr_clf.predict([some_digit])" + "svm_clf.classes_[5]" ] }, { @@ -866,7 +818,11 @@ "metadata": {}, "outputs": [], "source": [ - "len(ovr_clf.estimators_)" + "from sklearn.multiclass import OneVsRestClassifier\n", + "\n", + "ovr_clf = OneVsRestClassifier(SVC(gamma=\"auto\", random_state=42))\n", + "ovr_clf.fit(X_train[:2000], 
y_train[:2000])\n", + "ovr_clf.predict([some_digit])" ] }, { @@ -875,8 +831,7 @@ "metadata": {}, "outputs": [], "source": [ - "sgd_clf.fit(X_train, y_train)\n", - "sgd_clf.predict([some_digit])" + "len(ovr_clf.estimators_)" ] }, { @@ -885,14 +840,9 @@ "metadata": {}, "outputs": [], "source": [ - "sgd_clf.decision_function([some_digit])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: the following two cells may take close to 30 minutes to run, or more depending on your hardware." + "sgd_clf = SGDClassifier(random_state=42)\n", + "sgd_clf.fit(X_train[:2000], y_train[:2000])\n", + "sgd_clf.predict([some_digit])" ] }, { @@ -901,7 +851,7 @@ "metadata": {}, "outputs": [], "source": [ - "cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring=\"accuracy\")" + "np.round(sgd_clf.decision_function([some_digit]))" ] }, { @@ -909,11 +859,23 @@ "execution_count": 63, "metadata": {}, "outputs": [], + "source": [ + "cross_val_score(sgd_clf, X_train[:2000], y_train[:2000],\n", + " cv=3, scoring=\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], "source": [ "from sklearn.preprocessing import StandardScaler\n", + "\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))\n", - "cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring=\"accuracy\")" + "cross_val_score(sgd_clf, X_train_scaled[:2000], y_train[:2000],\n", + " cv=3, scoring=\"accuracy\")" ] }, { @@ -923,30 +885,13 @@ "# Error Analysis" ] }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)\n", - "conf_mx = confusion_matrix(y_train, y_train_pred)\n", - "conf_mx" - ] - }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ - "# since sklearn 0.22, you can use sklearn.metrics.plot_confusion_matrix()\n", - "def 
plot_confusion_matrix(matrix):\n", - " \"\"\"If you prefer color and a colorbar\"\"\"\n", - " fig = plt.figure(figsize=(8,8))\n", - " ax = fig.add_subplot(111)\n", - " cax = ax.matshow(matrix)\n", - " fig.colorbar(cax)" + "y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)" ] }, { @@ -955,8 +900,10 @@ "metadata": {}, "outputs": [], "source": [ - "plt.matshow(conf_mx, cmap=plt.cm.gray)\n", - "save_fig(\"confusion_matrix_plot\", tight_layout=False)\n", + "from sklearn.metrics import ConfusionMatrixDisplay\n", + "\n", + "ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred)\n", + "save_fig(\"confusion_matrix_plot\")\n", "plt.show()" ] }, @@ -966,8 +913,13 @@ "metadata": {}, "outputs": [], "source": [ - "row_sums = conf_mx.sum(axis=1, keepdims=True)\n", - "norm_conf_mx = conf_mx / row_sums" + "error_idx = y_train_pred != y_train\n", + "y_train_pred_error = y_train_pred[error_idx]\n", + "y_train_error = y_train[error_idx]\n", + "ConfusionMatrixDisplay.from_predictions(y_train_error, y_train_pred_error,\n", + " normalize=\"pred\", values_format=\".0%\")\n", + "save_fig(\"confusion_matrix_errors_plot\", tight_layout=False)\n", + "plt.show()" ] }, { @@ -976,10 +928,11 @@ "metadata": {}, "outputs": [], "source": [ - "np.fill_diagonal(norm_conf_mx, 0)\n", - "plt.matshow(norm_conf_mx, cmap=plt.cm.gray)\n", - "save_fig(\"confusion_matrix_errors_plot\", tight_layout=False)\n", - "plt.show()" + "cl_a, cl_b = 3, 5\n", + "X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]\n", + "X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]\n", + "X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]\n", + "X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]" ] }, { @@ -988,21 +941,41 @@ "metadata": {}, "outputs": [], "source": [ - "cl_a, cl_b = 3, 5\n", - "X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]\n", - "X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]\n", - "X_ba = X_train[(y_train == cl_b) & (y_train_pred 
== cl_a)]\n", - "X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]\n", - "\n", - "plt.figure(figsize=(8,8))\n", - "plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)\n", - "plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)\n", - "plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)\n", - "plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)\n", + "# not in the book\n", + "size = 5\n", + "pad = 0.2\n", + "plt.figure(figsize=(size, size))\n", + "for images, (label_col, label_row) in [(X_ba, (0, 0)), (X_bb, (1, 0)),\n", + " (X_aa, (0, 1)), (X_ab, (1, 1))]:\n", + " for idx, image_data in enumerate(images[:size*size]):\n", + " x = idx % size + label_col * (size + pad)\n", + " y = idx // size + label_row * (size + pad)\n", + " plt.imshow(image_data.reshape(28, 28), cmap=\"binary\",\n", + " extent=(x, x + 1, y, y + 1))\n", + "plt.xticks([size / 2, size + pad + size / 2], [str(cl_a), str(cl_b)])\n", + "plt.yticks([size / 2, size + pad + size / 2], [str(cl_b), str(cl_a)])\n", + "plt.plot([size + pad / 2, size + pad / 2], [0, 2 * size + pad], \"k:\")\n", + "plt.plot([0, 2 * size + pad], [size + pad / 2, size + pad / 2], \"k:\")\n", + "plt.axis([0, 2 * size + pad, 0, 2 * size + pad])\n", + "plt.xlabel(\"Predicted label\")\n", + "plt.ylabel(\"True label\")\n", "save_fig(\"error_analysis_digits_plot\")\n", "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: there are several other ways you could code a plot like this one, but it's a bit hard to get the axis labels right:\n", + "* using [nested GridSpecs](https://matplotlib.org/stable/gallery/subplots_axes_and_figures/gridspec_nested.html)\n", + "* merging all the digits in each block into a single image (then using 2×2 subplots). 
For example:\n", + " ```python\n", + " X_aa[:25].reshape(5, 5, 28, 28).transpose(0, 2, 1, 3).reshape(5 * 28, 5 * 28)\n", + " ```\n", + "* using [subfigures](https://matplotlib.org/stable/gallery/subplots_axes_and_figures/subfigures.html) (since Matplotlib 3.4)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1039,7 +1012,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: the following cell may take a very long time (possibly hours depending on your hardware)." + "**Warning**: the following cell may take a few minutes:" ] }, { @@ -1095,159 +1068,8 @@ "knn_clf.fit(X_train_mod, y_train_mod)\n", "clean_digit = knn_clf.predict([X_test_mod[some_index]])\n", "plot_digit(clean_digit)\n", - "save_fig(\"cleaned_digit_example_plot\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Extra material" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dummy (ie. random) classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.dummy import DummyClassifier\n", - "dmy_clf = DummyClassifier(strategy=\"prior\")\n", - "y_probas_dmy = cross_val_predict(dmy_clf, X_train, y_train_5, cv=3, method=\"predict_proba\")\n", - "y_scores_dmy = y_probas_dmy[:, 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 77, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "fprr, tprr, thresholdsr = roc_curve(y_train_5, y_scores_dmy)\n", - "plot_roc_curve(fprr, tprr)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## KNN classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.neighbors import KNeighborsClassifier\n", - "knn_clf = KNeighborsClassifier(weights='distance', n_neighbors=4)\n", - "knn_clf.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - 
"outputs": [], - "source": [ - "y_knn_pred = knn_clf.predict(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import accuracy_score\n", - "accuracy_score(y_test, y_knn_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "from scipy.ndimage.interpolation import shift\n", - "def shift_digit(digit_array, dx, dy, new=0):\n", - " return shift(digit_array.reshape(28, 28), [dy, dx], cval=new).reshape(784)\n", - "\n", - "plot_digit(shift_digit(some_digit, 5, 1, new=100))" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [], - "source": [ - "X_train_expanded = [X_train]\n", - "y_train_expanded = [y_train]\n", - "for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):\n", - " shifted_images = np.apply_along_axis(shift_digit, axis=1, arr=X_train, dx=dx, dy=dy)\n", - " X_train_expanded.append(shifted_images)\n", - " y_train_expanded.append(y_train)\n", - "\n", - "X_train_expanded = np.concatenate(X_train_expanded)\n", - "y_train_expanded = np.concatenate(y_train_expanded)\n", - "X_train_expanded.shape, y_train_expanded.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [], - "source": [ - "knn_clf.fit(X_train_expanded, y_train_expanded)" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [], - "source": [ - "y_knn_expanded_pred = knn_clf.predict(X_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [], - "source": [ - "accuracy_score(y_test, y_knn_expanded_pred)" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [], - "source": [ - "ambiguous_digit = X_test[2589]\n", - "knn_clf.predict_proba([ambiguous_digit])" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [], - 
"source": [ - "plot_digit(ambiguous_digit)" + "save_fig(\"cleaned_digit_example_plot\")\n", + "plt.show()" ] }, { @@ -1268,27 +1090,60 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: the next cell may take close to 16 hours to run, or more depending on your hardware." + "Exercise: _Try to build a classifier for the MNIST dataset that achieves over 97% accuracy on the test set. Hint: the `KNeighborsClassifier` works quite well for this task; you just need to find good hyperparameter values (try a grid search on the `weights` and `n_neighbors` hyperparameters)._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's start with a simple K-Nearest Neighbors classifier and measure its performance on the test set. This will be our baseline:" ] }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "knn_clf = KNeighborsClassifier()\n", + "knn_clf.fit(X_train, y_train)\n", + "baseline_accuracy = knn_clf.score(X_test, y_test)\n", + "baseline_accuracy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Great! A regular KNN classifier with the default hyperparameters is already very close to our goal." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see if we tuning the hyperparameters can help. 
To speed up the search, let's train only on the first 10,000 images:" + ] + }, + { + "cell_type": "code", + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", - "param_grid = [{'weights': [\"uniform\", \"distance\"], 'n_neighbors': [3, 4, 5]}]\n", + "param_grid = [{'weights': [\"uniform\", \"distance\"], 'n_neighbors': [3, 4, 5, 6]}]\n", "\n", "knn_clf = KNeighborsClassifier()\n", - "grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)\n", - "grid_search.fit(X_train, y_train)" + "grid_search = GridSearchCV(knn_clf, param_grid, cv=5)\n", + "grid_search.fit(X_train[:10_000], y_train[:10_000])" ] }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1297,23 +1152,36 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "grid_search.best_score_" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The score dropped, but that was expected since we only trained on 10,000 images. So let's take the best model and train it again on the full training set:" + ] + }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ - "from sklearn.metrics import accuracy_score\n", - "\n", - "y_pred = grid_search.predict(X_test)\n", - "accuracy_score(y_test, y_pred)" + "grid_search.best_estimator_.fit(X_train, y_train)\n", + "tuned_accuracy = grid_search.score(X_test, y_test)\n", + "tuned_accuracy" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We reached our goal of 97% accuracy! 🥳" ] }, { @@ -1323,9 +1191,23 @@ "## 2. Data Augmentation" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Exercise: _Write a function that can shift an MNIST image in any direction (left, right, up, or down) by one pixel. 
You can use the `shift()` function from the `scipy.ndimage.interpolation` module. For example, `shift(image, [2, 1], cval=0)` shifts the image two pixels down and one pixel to the right. Then, for each image in the training set, create four shifted copies (one per direction) and add them to the training set. Finally, train your best model on this expanded training set and measure its accuracy on the test set. You should observe that your model performs even better now! This technique of artificially growing the training set is called _data augmentation_ or _training set expansion_._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try augmenting the MNIST dataset by adding slightly shifted versions of each image." + ] + }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1334,7 +1216,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1344,39 +1226,56 @@ " return shifted_image.reshape([-1])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see if it works:" + ] + }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ - "image = X_train[1000]\n", + "image = X_train[1000] # some random digit to demo\n", "shifted_image_down = shift_image(image, 0, 5)\n", "shifted_image_left = shift_image(image, -5, 0)\n", "\n", "plt.figure(figsize=(12,3))\n", "plt.subplot(131)\n", - "plt.title(\"Original\", fontsize=14)\n", - "plt.imshow(image.reshape(28, 28), interpolation=\"nearest\", cmap=\"Greys\")\n", + "plt.title(\"Original\")\n", + "plt.imshow(image.reshape(28, 28),\n", + " interpolation=\"nearest\", cmap=\"Greys\")\n", "plt.subplot(132)\n", - "plt.title(\"Shifted down\", fontsize=14)\n", - "plt.imshow(shifted_image_down.reshape(28, 28), interpolation=\"nearest\", cmap=\"Greys\")\n", + "plt.title(\"Shifted 
down\")\n", + "plt.imshow(shifted_image_down.reshape(28, 28),\n", + " interpolation=\"nearest\", cmap=\"Greys\")\n", "plt.subplot(133)\n", - "plt.title(\"Shifted left\", fontsize=14)\n", - "plt.imshow(shifted_image_left.reshape(28, 28), interpolation=\"nearest\", cmap=\"Greys\")\n", + "plt.title(\"Shifted left\")\n", + "plt.imshow(shifted_image_left.reshape(28, 28),\n", + " interpolation=\"nearest\", cmap=\"Greys\")\n", "plt.show()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks good! Now let's create an augmented training set by shifting every image left, right, up and down by one pixel:" + ] + }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "X_train_augmented = [image for image in X_train]\n", "y_train_augmented = [label for label in y_train]\n", "\n", - "for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):\n", + "for dx, dy in ((-1, 0), (1, 0), (0, 1), (0, -1)):\n", " for image, label in zip(X_train, y_train):\n", " X_train_augmented.append(shift_image(image, dx, dy))\n", " y_train_augmented.append(label)\n", @@ -1385,9 +1284,16 @@ "y_train_augmented = np.array(y_train_augmented)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's shuffle the augmented training set, or else all shifted images will be grouped together:" + ] + }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1396,9 +1302,16 @@ "y_train_augmented = y_train_augmented[shuffle_idx]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's train the model using the best hyperparameters we found in the previous exercise:" + ] + }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1407,7 +1320,7 @@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ 
-1418,24 +1331,42 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: the following cell may take close to an hour to run, depending on your hardware." + "**Warning**: the following cell may take a few minutes to run." ] }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ - "y_pred = knn_clf.predict(X_test)\n", - "accuracy_score(y_test, y_pred)" + "augmented_accuracy = knn_clf.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "By simply augmenting the data, we got a 0.5% accuracy boost. :)" + "By simply augmenting the data, we got a 0.5% accuracy boost. Perhaps this does not sound so impressive, but this actually means that the error rate dropped significantly:" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "error_rate_change = (1 - augmented_accuracy) / (1 - tuned_accuracy) - 1\n", + "print(f\"error_rate_change = {error_rate_change:.0%}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The error rate dropped quite a bit thanks to data augmentation." ] }, { @@ -1445,6 +1376,13 @@ "## 3. Tackle the Titanic dataset" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Exercise: _Tackle the Titanic dataset. 
A great place to start is on [Kaggle](https://www.kaggle.com/c/titanic)._" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1461,7 +1399,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1486,7 +1424,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1509,7 +1447,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1542,7 +1480,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1559,7 +1497,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1568,7 +1506,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1598,7 +1536,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1623,7 +1561,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1639,7 +1577,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1648,7 +1586,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -1657,7 +1595,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1680,7 +1618,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1703,7 +1641,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1712,7 +1650,7 @@ }, 
{ "cell_type": "code", - "execution_count": 114, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ @@ -1732,7 +1670,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1756,7 +1694,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1773,7 +1711,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -1789,7 +1727,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -1808,7 +1746,7 @@ }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1825,7 +1763,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -1851,7 +1789,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -1878,7 +1816,7 @@ }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1888,7 +1826,7 @@ "plt.plot([1]*10, svm_scores, \".\")\n", "plt.plot([2]*10, forest_scores, \".\")\n", "plt.boxplot([svm_scores, forest_scores], labels=(\"SVM\",\"Random Forest\"))\n", - "plt.ylabel(\"Accuracy\", fontsize=14)\n", + "plt.ylabel(\"Accuracy\")\n", "plt.show()" ] }, @@ -1914,7 +1852,7 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ @@ -1924,7 +1862,7 @@ }, { "cell_type": "code", - "execution_count": 124, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1943,12 +1881,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, let's fetch the data:" + "Exercise: _Build a spam classifier (a more challenging 
exercise):_\n", + "\n", + "* _Download examples of spam and ham from [Apache SpamAssassin's public datasets](https://homl.info/spamassassin)._\n", + "* _Unzip the datasets and familiarize yourself with the data format._\n", + "* _Split the datasets into a training set and a test set._\n", + "* _Write a data preparation pipeline to convert each email into a feature vector. Your preparation pipeline should transform an email into a (sparse) vector that indicates the presence or absence of each possible word. For example, if all emails only ever contain four words, \"Hello,\" \"how,\" \"are,\" \"you,\" then the email \"Hello you Hello Hello you\" would be converted into a vector [1, 0, 0, 1] (meaning [\"Hello\" is present, \"how\" is absent, \"are\" is absent, \"you\" is present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of each word._\n", + "\n", + "_You may want to add hyperparameters to your preparation pipeline to control whether or not to strip off email headers, convert each email to lowercase, remove punctuation, replace all URLs with \"URL,\" replace all numbers with \"NUMBER,\" or even perform _stemming_ (i.e., trim off word endings; there are Python libraries available to do this)._\n", + "\n", + "_Finally, try out several classifiers and see if you can build a great spam classifier, with both high recall and high precision._" ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ @@ -1977,7 +1924,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1993,7 +1940,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -2003,7 +1950,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ @@ -2012,7 +1959,7 @@ }, { "cell_type": "code", - "execution_count": 
42, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ @@ -2028,7 +1975,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ @@ -2042,7 +1989,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ @@ -2059,7 +2006,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ @@ -2068,7 +2015,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ @@ -2084,7 +2031,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ @@ -2103,7 +2050,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ @@ -2119,7 +2066,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ @@ -2128,7 +2075,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ @@ -2151,7 +2098,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ @@ -2168,7 +2115,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 128, "metadata": {}, "outputs": [], "source": [ @@ -2184,7 +2131,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 129, "metadata": {}, "outputs": [], "source": [ @@ -2206,7 +2153,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ @@ -2230,7 +2177,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ @@ -2249,7 +2196,7 @@ }, { "cell_type": "code", - "execution_count": 60, + 
"execution_count": 132, "metadata": {}, "outputs": [], "source": [ @@ -2265,7 +2212,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 133, "metadata": {}, "outputs": [], "source": [ @@ -2289,7 +2236,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -2307,7 +2254,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -2333,7 +2280,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ @@ -2355,7 +2302,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 137, "metadata": {}, "outputs": [], "source": [ @@ -2378,7 +2325,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ @@ -2430,7 +2377,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ @@ -2455,7 +2402,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 140, "metadata": {}, "outputs": [], "source": [ @@ -2486,7 +2433,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 141, "metadata": {}, "outputs": [], "source": [ @@ -2497,7 +2444,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ @@ -2513,7 +2460,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ @@ -2529,7 +2476,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ @@ -2552,7 +2499,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 145, "metadata": {}, "outputs": [], "source": [ @@ -2575,7 +2522,7 @@ }, { "cell_type": "code", - "execution_count": 78, + 
"execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -2602,7 +2549,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" },