diff --git a/05_support_vector_machines.ipynb b/05_support_vector_machines.ipynb index 687d74b..abbc1c1 100644 --- a/05_support_vector_machines.ipynb +++ b/05_support_vector_machines.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "**Chapter 5 – Support Vector Machines**\n", "\n", @@ -14,20 +11,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] @@ -35,11 +26,7 @@ { "cell_type": "code", "execution_count": 1, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", @@ -74,20 +61,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin classification" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "The next few code cells generate the first figures in chapter 5. The first actual code sample comes after:" ] @@ -95,11 +76,7 @@ { "cell_type": "code", "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -121,11 +98,7 @@ { "cell_type": "code", "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Bad models\n", @@ -179,10 +152,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to feature scales" ] @@ -190,11 +160,7 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "Xs = np.array([[1, 50], [5, 20], [3, 80], [5, 60]]).astype(np.float64)\n", @@ -230,10 +196,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Sensitivity to outliers" ] @@ -241,11 +204,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X_outliers = np.array([[3.4, 1.3], [3.2, 0.8]])\n", @@ -295,20 +254,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Large margin *vs* margin violations" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This is the first code example in chapter 5:" ] @@ -316,11 +269,7 @@ { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -344,11 +293,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], 
"source": [ "svm_clf.predict([[5.5, 1.7]])" @@ -356,10 +301,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's generate the graph comparing different regularization settings:" ] @@ -367,11 +309,7 @@ { "cell_type": "code", "execution_count": 8, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -394,11 +332,7 @@ { "cell_type": "code", "execution_count": 9, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Convert to unscaled parameters\n", @@ -422,11 +356,7 @@ { "cell_type": "code", "execution_count": 10, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12,3.2))\n", @@ -454,9 +384,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "# Non-linear classification" @@ -465,11 +393,7 @@ { "cell_type": "code", "execution_count": 11, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X1D = np.linspace(-4, 4, 9).reshape(-1, 1)\n", @@ -508,11 +432,7 @@ { "cell_type": "code", "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -533,11 +453,7 @@ { "cell_type": "code", "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import make_moons\n", @@ -556,11 +472,7 @@ { "cell_type": "code", "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_predictions(clf, axes):\n", @@ -583,11 +495,7 @@ { "cell_type": "code", "execution_count": 15, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -602,11 +510,7 @@ { "cell_type": "code", "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "poly100_kernel_svm_clf = Pipeline([\n", @@ -619,11 +523,7 @@ { "cell_type": "code", "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(11, 4))\n", @@ -646,9 +546,6 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -716,11 +613,7 @@ { "cell_type": "code", "execution_count": 19, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "x1_example = X1D[3, 0]\n", @@ -732,11 +625,7 @@ { "cell_type": "code", "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rbf_kernel_svm_clf = Pipeline([\n", @@ -750,9 +639,6 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false, - "deletable": true, - "editable": 
true, "scrolled": true }, "outputs": [], @@ -787,10 +673,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Regression\n" ] @@ -798,11 +681,7 @@ { "cell_type": "code", "execution_count": 22, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -814,11 +693,7 @@ { "cell_type": "code", "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -830,11 +705,7 @@ { "cell_type": "code", "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_reg1 = LinearSVR(epsilon=1.5, random_state=42)\n", @@ -857,11 +728,7 @@ { "cell_type": "code", "execution_count": 25, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_svm_regression(svm_reg, X, y, axes):\n", @@ -898,11 +765,7 @@ { "cell_type": "code", "execution_count": 26, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", @@ -914,11 +777,7 @@ { "cell_type": "code", "execution_count": 27, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -930,11 +789,7 @@ { "cell_type": "code", "execution_count": 28, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -948,11 +803,7 @@ { "cell_type": "code", "execution_count": 29, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(9, 4))\n", @@ -969,10 +820,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Under the hood" ] @@ -980,11 +828,7 @@ { "cell_type": "code", "execution_count": 30, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "iris = datasets.load_iris()\n", @@ -995,11 +839,7 @@ { "cell_type": "code", "execution_count": 31, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from mpl_toolkits.mplot3d import Axes3D\n", @@ -1042,10 +882,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Small weight vector results in a large margin" ] @@ -1053,11 +890,7 @@ { "cell_type": "code", "execution_count": 32, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "def plot_2D_decision_function(w, b, ylabel=True, x1_lim=[-3, 3]):\n", @@ -1091,11 +924,7 @@ { "cell_type": "code", "execution_count": 33, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC\n", @@ -1112,10 +941,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Hinge loss" ] @@ -1123,11 +949,7 @@ 
{ "cell_type": "code", "execution_count": 34, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "t = np.linspace(-2, 4, 200)\n", @@ -1148,20 +970,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Extra material" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Training time" ] @@ -1169,11 +985,7 @@ { "cell_type": "code", "execution_count": 35, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "X, y = make_moons(n_samples=1000, noise=0.4, random_state=42)\n", @@ -1184,11 +996,7 @@ { "cell_type": "code", "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "import time\n", @@ -1210,10 +1018,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## Linear SVM classifier implementation using Batch Gradient Descent" ] @@ -1221,11 +1026,7 @@ { "cell_type": "code", "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Training set\n", @@ -1236,11 +1037,7 @@ { "cell_type": "code", "execution_count": 38, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", @@ -1286,7 +1083,7 @@ "\n", " self.intercept_ = np.array([b])\n", " self.coef_ = np.array([w])\n", - " support_vectors_idx = (X_t.dot(w) + b < 1).ravel()\n", + " support_vectors_idx = (X_t.dot(w) + t * b < 1).ravel()\n", " self.support_vectors_ = X[support_vectors_idx]\n", " return self\n", "\n", @@ -1305,11 +1102,7 @@ { "cell_type": "code", "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "plt.plot(range(svm_clf.n_epochs), svm_clf.Js)\n", @@ -1319,11 +1112,7 @@ { "cell_type": "code", "execution_count": 40, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "print(svm_clf.intercept_, svm_clf.coef_)" @@ -1332,11 +1121,7 @@ { "cell_type": "code", "execution_count": 41, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf2 = SVC(kernel=\"linear\", C=C)\n", @@ -1347,11 +1132,7 @@ { "cell_type": "code", "execution_count": 42, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "yr = y.ravel()\n", @@ -1378,9 +1159,6 @@ "cell_type": "code", "execution_count": 43, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true, "scrolled": true }, "outputs": [], @@ -1412,20 +1190,14 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Exercise solutions" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 1. to 7." 
] @@ -1433,9 +1205,7 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "source": [ "See appendix A." @@ -1443,30 +1213,21 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 8." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train a `LinearSVC` on a linearly separable dataset. Then train an `SVC` and a `SGDClassifier` on the same dataset. See if you can get them to produce roughly the same model._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's use the Iris dataset: the Iris Setosa and Iris Versicolor classes are linearly separable." ] @@ -1475,9 +1236,7 @@ "cell_type": "code", "execution_count": 44, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1495,11 +1254,7 @@ { "cell_type": "code", "execution_count": 45, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVC, LinearSVC\n", @@ -1528,10 +1283,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's plot the decision boundaries of these three models:" ] @@ -1539,11 +1291,7 @@ { "cell_type": "code", "execution_count": 46, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "# Compute the slope and bias of each decision boundary\n", @@ -1576,40 +1324,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Close enough!" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# 9." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM classifier on the MNIST dataset. Since SVM classifiers are binary classifiers, you will need to use one-versus-all to classify all 10 digits. You may want to tune the hyperparameters using small validation sets to speed up the process. What accuracy can you reach?_" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "First, let's load the dataset and split it into a training set and a test set. 
We could use `train_test_split()` but people usually just take the first 60,000 instances for the training set, and the last 10,000 instances for the test set (this makes it possible to compare your model's performance with others): " ] @@ -1617,11 +1353,7 @@ { "cell_type": "code", "execution_count": 47, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_mldata\n", @@ -1638,10 +1370,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Many training algorithms are sensitive to the order of the training instances, so it's generally good practice to shuffle them first:" ] @@ -1650,9 +1379,7 @@ "cell_type": "code", "execution_count": 48, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1664,10 +1391,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's start simple, with a linear SVM classifier. It will automatically use the One-vs-All (also called One-vs-the-Rest, OvR) strategy, so there's nothing special we need to do. Easy!" ] @@ -1675,11 +1399,7 @@ { "cell_type": "code", "execution_count": 49, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1688,10 +1408,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's make predictions on the training set and measure the accuracy (we don't want to measure it on the test set yet, since we have not selected and trained the final model yet):" ] @@ -1699,11 +1416,7 @@ { "cell_type": "code", "execution_count": 50, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import accuracy_score\n", @@ -1714,10 +1427,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Wow, 82% accuracy on MNIST is a really bad performance. This linear model is certainly too simple for MNIST, but perhaps we just needed to scale the data first:" ] @@ -1725,11 +1435,7 @@ { "cell_type": "code", "execution_count": 51, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "scaler = StandardScaler()\n", @@ -1740,11 +1446,7 @@ { "cell_type": "code", "execution_count": 52, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "lin_clf = LinearSVC(random_state=42)\n", @@ -1754,11 +1456,7 @@ { "cell_type": "code", "execution_count": 53, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = lin_clf.predict(X_train_scaled)\n", @@ -1767,10 +1465,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's much better (we cut the error rate in two), but still not great at all for MNIST. If we want to use an SVM, we will have to use a kernel. 
Let's try an `SVC` with an RBF kernel (the default).\n", "\n", @@ -1780,11 +1475,7 @@ { "cell_type": "code", "execution_count": 54, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "svm_clf = SVC(decision_function_shape=\"ovr\")\n", @@ -1794,11 +1485,7 @@ { "cell_type": "code", "execution_count": 55, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = svm_clf.predict(X_train_scaled)\n", @@ -1807,10 +1494,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "That's promising, we get better performance even though we trained the model on 6 times less data. Let's tune the hyperparameters by doing a randomized search with cross validation. We will do this on a small dataset just to speed up the process:" ] @@ -1818,11 +1502,7 @@ { "cell_type": "code", "execution_count": 56, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", @@ -1836,11 +1516,7 @@ { "cell_type": "code", "execution_count": 57, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -1849,11 +1525,7 @@ { "cell_type": "code", "execution_count": 58, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_score_" @@ -1861,10 +1533,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "This looks pretty low but remember we only trained the model on 1,000 instances. Let's retrain the best estimator on the whole training set (run this at night, it will take hours):" ] @@ -1872,11 +1541,7 @@ { "cell_type": "code", "execution_count": 59, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_.fit(X_train_scaled, y_train)" @@ -1885,11 +1550,7 @@ { "cell_type": "code", "execution_count": 60, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -1898,10 +1559,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Ah, this looks good! Let's select this model. Now we can test it on the test set:" ] @@ -1909,11 +1567,7 @@ { "cell_type": "code", "execution_count": 61, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -1922,40 +1576,28 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Not too bad, but apparently the model is overfitting slightly. It's tempting to tweak the hyperparameters a bit more (e.g. decreasing `C` and/or `gamma`), but we would run the risk of overfitting the test set. Other people have found that the hyperparameters `C=5` and `gamma=0.005` yield even better performance (over 98% accuracy). 
By running the randomized search for longer and on a larger part of the training set, you may be able to find this as well." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 10." ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "_Exercise: train an SVM regressor on the California housing dataset._" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's load the dataset using Scikit-Learn's `fetch_california_housing()` function:" ] @@ -1964,9 +1606,7 @@ "cell_type": "code", "execution_count": 62, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -1979,10 +1619,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Split it into a training set and a test set:" ] @@ -1991,9 +1628,7 @@ "cell_type": "code", "execution_count": 63, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2004,10 +1639,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Don't forget to scale the data:" ] @@ -2016,9 +1648,7 @@ "cell_type": "code", "execution_count": 64, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ @@ -2031,10 +1661,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's train a simple `LinearSVR` first:" ] @@ -2042,11 +1669,7 @@ { "cell_type": "code", "execution_count": 65, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import LinearSVR\n", @@ -2057,10 +1680,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's see how it performs on the training set:" ] @@ -2068,11 +1688,7 @@ { "cell_type": "code", "execution_count": 66, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", @@ -2084,10 +1700,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Let's look at the RMSE:" ] @@ -2095,11 +1708,7 @@ { "cell_type": "code", "execution_count": 67, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "np.sqrt(mse)" @@ -2107,10 +1716,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "In this training set, the targets are tens of thousands of dollars. The RMSE gives a rough idea of the kind of error you should expect (with a higher weight for large errors): so with this model we can expect errors somewhere around $10,000. Not great. Let's see if we can do better with an RBF Kernel. 
We will use randomized search with cross validation to find the appropriate hyperparameter values for `C` and `gamma`:" ] @@ -2118,11 +1724,7 @@ { "cell_type": "code", "execution_count": 68, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "from sklearn.svm import SVR\n", @@ -2137,11 +1739,7 @@ { "cell_type": "code", "execution_count": 69, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "rnd_search_cv.best_estimator_" @@ -2149,10 +1747,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Now let's measure the RMSE on the training set:" ] @@ -2160,11 +1755,7 @@ { "cell_type": "code", "execution_count": 70, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_train_scaled)\n", @@ -2174,10 +1765,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Looks much better than the linear model. Let's select this model and evaluate it on the test set:" ] @@ -2185,11 +1773,7 @@ { "cell_type": "code", "execution_count": 71, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ "y_pred = rnd_search_cv.best_estimator_.predict(X_test_scaled)\n", @@ -2201,9 +1785,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [] @@ -2225,7 +1807,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.3" + "version": "3.6.3" }, "nav_menu": {}, "toc": { @@ -2239,5 +1821,5 @@ } }, "nbformat": 4, - "nbformat_minor": 0 + "nbformat_minor": 1 } diff --git a/README.md b/README.md index bbec5aa..faa749c 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,11 @@ Of course, you obviously need Python. Python 2 is already preinstalled on most s $ python --version # for Python 2 $ python3 --version # for Python 3 -Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: +Any Python 3 version should be fine, preferably ≥3.5. If you don't have Python 3, I recommend installing it (Python ≥2.6 should work, but it is deprecated so Python 3 is preferable). To do so, you have several options: on Windows or MacOSX, you can just download it from [python.org](https://www.python.org/downloads/). On MacOSX, you can alternatively use [MacPorts](https://www.macports.org/) or [Homebrew](https://brew.sh/). 
If you are using Python 3.6 on MacOSX, you need to run the following command to install the `certifi` package of certificates because Python 3.6 on MacOSX has no certificates to validate SSL connections (see this [StackOverflow question](https://stackoverflow.com/questions/27835619/urllib-and-ssl-certificate-verify-failed-error)): + + $ /Applications/Python\ 3.6/Install\ Certificates.command + +On Linux, unless you know what you are doing, you should use your system's packaging system. For example, on Debian or Ubuntu, type: $ sudo apt-get update $ sudo apt-get install python3 @@ -49,9 +53,9 @@ When using Anaconda, you can optionally create an isolated Python environment de This creates a fresh Python 3.5 environment called `mlbook` (you can change the name if you want to), and it activates it. This environment contains all the scientific libraries that come with Anaconda. This includes all the libraries we will need (NumPy, Matplotlib, Pandas, Jupyter and a few others), except for TensorFlow, so let's install it: - $ conda install -n mlbook -c conda-forge tensorflow=1.0.0 + $ conda install -n mlbook -c conda-forge tensorflow=1.4.0 -This installs TensorFlow 1.0.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. +This installs TensorFlow 1.4.0 in the `mlbook` environment (fetching it from the `conda-forge` repository). If you chose not to create an `mlbook` environment, then just remove the `-n mlbook` option. Next, you can optionally install Jupyter extensions. These are useful to have nice tables of contents in the notebooks, but they are not required. @@ -105,3 +109,6 @@ This should open up your browser, and you should see Jupyter's tree view, with t Note: you can also visit [http://localhost:8888/nbextensions](http://localhost:8888/nbextensions) to activate and configure Jupyter extensions. Congrats! You are ready to learn Machine Learning, hands on! + +# Contributors +I would like to thank everyone who contributed to this project, either by providing useful feedback, filing issues or submitting Pull Requests. Special thanks go to Steven Bunkley and Ziembla who created the `docker` directory. 
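A quick sanity check for the TensorFlow upgrade described above (the README now installs tensorflow=1.4.0 from conda-forge): from inside the activated `mlbook` environment you can run a tiny TF 1.x graph. This is only an illustrative sketch, not a file in the repository:

    # tf_sanity_check.py -- hypothetical helper, not part of this repo
    import tensorflow as tf

    # The README installs tensorflow=1.4.0 via conda-forge; print what was actually picked up.
    print("TensorFlow version:", tf.__version__)

    # Build and run a trivial graph with the TF 1.x Session API.
    hello = tf.constant("Hello, handson-ml!")
    with tf.Session() as sess:
        print(sess.run(hello))

If the version printed is not the one you expect, the environment is probably not activated (or the `-n mlbook` option was dropped during installation).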
diff --git a/docker/.env b/docker/.env new file mode 100644 index 0000000..16adf41 --- /dev/null +++ b/docker/.env @@ -0,0 +1 @@ +COMPOSE_PROJECT_NAME=handson-ml diff --git a/docker/Dockerfile b/docker/Dockerfile index 54e5510..b4ec526 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,28 +1,29 @@ FROM continuumio/anaconda3 RUN apt-get update && apt-get upgrade -y \ - && apt-get install -y \ - libpq-dev \ - build-essential \ - git \ - sudo \ - && rm -rf /var/lib/apt/lists/* + && apt-get install -y \ + libpq-dev \ + build-essential \ + git \ + sudo \ + && rm -rf /var/lib/apt/lists/* RUN conda install -y -c conda-forge \ - tensorflow=1.0.0 \ - jupyter_contrib_nbextensions + tensorflow \ + jupyter_contrib_nbextensions ARG username ARG userid +ARG home=/home/${username} +ARG workdir=${home}/handson-ml + RUN adduser ${username} --uid ${userid} --gecos '' --disabled-password \ - && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ - && chmod 0440 /etc/sudoers.d/${username} + && echo "${username} ALL=(root) NOPASSWD:ALL" > /etc/sudoers.d/${username} \ + && chmod 0440 /etc/sudoers.d/${username} -ENV HOME /home/${username} - -WORKDIR ${HOME}/handson-ml -RUN chown ${username}:${username} ${HOME}/handson-ml +WORKDIR ${workdir} +RUN chown ${username}:${username} ${workdir} USER ${username} @@ -30,6 +31,55 @@ RUN jupyter contrib nbextension install --user RUN jupyter nbextension enable toc2/main +# INFO: Jupyter and nbdime extension are not totally integrated (anaconda image is py36, +# nbdime checks for py35 at the moment, still the config below enables diffing +# notebooks with nbdiff (and nbdiff support in git diff command) after connecting +# to the container by "make exec" (or "docker-compose exec handson-ml bash") +# You may also try running: +# nbd NOTEBOOK_NAME.ipynb +# to get nbdiff between checkpointed version and current version of the given notebook +USER root +WORKDIR / +RUN conda install -y -c conda-forge nbdime +USER ${username} +WORKDIR ${workdir} + +RUN git-nbdiffdriver config --enable --global + +# INFO: Optionally uncomment any (one) of the following RUN commands below to ignore either +# metadata or details in nbdiff within git diff +#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' +RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-details' + + +# INFO: Dirty nbdime patching (ignored if not matching) +COPY docker/nbdime-*.patch /tmp/ +USER root +WORKDIR / +RUN patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-1-details.patch || true \ + && patch -d /opt/conda/lib/python3.6/site-packages -p1 --forward --reject-file=- < \ + /tmp/nbdime-2-toc.patch || true +RUN rm /tmp/nbdime-*.patch +USER ${username} +WORKDIR ${workdir} + + +COPY docker/bashrc.bash /tmp/ +RUN cat /tmp/bashrc.bash >> ${home}/.bashrc +RUN echo "export PATH=\"${workdir}/docker/bin:$PATH\"" >> ${home}/.bashrc +RUN sudo rm /tmp/bashrc.bash + + +# INFO: Uncomment lines below to enable automatic save of python-only and html-only +# exports alongside the notebook +#COPY docker/jupyter_notebook_config.py /tmp/ +#RUN cat /tmp/jupyter_notebook_config.py >> ${home}/.jupyter/jupyter_notebook_config.py +#RUN sudo rm /tmp/jupyter_notebook_config.py + +# INFO: Uncomment the RUN command below to disable git diff paging +#RUN git config --global core.pager '' + # INFO: Uncomment the RUN command below for easy and constant notebook URL (just localhost:8888) # That will switch 
jupyter to using empty password instead of a token. # To avoid making a security hole you SHOULD in fact not only uncomment but @@ -38,34 +88,6 @@ RUN jupyter nbextension enable toc2/main # from notebook.auth import passwd # passwd() # and take the hash from the output -#RUN mkdir -p ${HOME}/.jupyter && \ -# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ -# >> ${HOME}/.jupyter/jupyter_notebook_config.py - -# INFO: Uncomment the RUN command below to disable git diff paging -#RUN git config --global core.pager '' - - -# INFO: Below - work in progress, nbdime not totally integrated, still it enables diffing -# notebooks with nbdiff (and nbdiff support in git diff command) after connecting to -# the container by "make exec" (docker exec) -# Try: -# nbd NOTEBOOK_NAME.ipynb -# to get nbdiff between checkpointed version and current version of the given notebook -USER root -WORKDIR / - -RUN conda install -y -c conda-forge nbdime - -USER ${username} -WORKDIR ${HOME}/handson-ml - -RUN git-nbdiffdriver config --enable --global - -# INFO: Uncomment the RUN command below to ignore metadata in nbdiff within git diff -#RUN git config --global diff.jupyternotebook.command 'git-nbdiffdriver diff --ignore-metadata' - - -COPY docker/bashrc /tmp/bashrc -RUN cat /tmp/bashrc >> ${HOME}/.bashrc -RUN sudo rm -rf /tmp/bashrc +#RUN mkdir -p ${home}/.jupyter && \ +# echo 'c.NotebookApp.password = u"sha1:c6bbcba2d04b:f969e403db876dcfbe26f47affe41909bd53392e"' \ +# >> ${home}/.jupyter/jupyter_notebook_config.py diff --git a/docker/Makefile b/docker/Makefile index 6078fc9..f85c49a 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -4,7 +4,7 @@ help: run: docker-compose up exec: - docker-compose exec handson-ml /bin/bash + docker-compose exec handson-ml bash build: stop .FORCE docker-compose build rebuild: stop .FORCE diff --git a/docker/README.md b/docker/README.md index 50b6f12..037ae22 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,5 +1,5 @@ -# Hands-on Machine Learning in Docker :-) +# Hands-on Machine Learning in Docker This is the Docker configuration which allows you to run and tweak the book's notebooks without installing any dependencies on your machine!
OK, any except `docker`. With `docker-compose`. Well, you may also want `make` (but it is only used as thin layer to call a few simple `docker-compose` commands). @@ -32,7 +32,9 @@ You can close the server just by pressing `Ctrl-C` in terminal window. Run `make exec` (or `docker-compose exec handson-ml bash`) while the server is running to run an additional `bash` shell inside the `handson-ml` container. Now you're inside the environment prepared within the image. -One of the usefull things that can be done there may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. +One of the useful things that can be done there is starting TensorBoard (for example with the simple `tb` command; see the bashrc file). + +Another one may be comparing versions of the notebooks using the `nbdiff` command if you haven't got `nbdime` installed locally (it is **way** better than plain `diff` for notebooks). See [Tools for diffing and merging of Jupyter notebooks](https://github.com/jupyter/nbdime) for more details. You can see changes you made relative to the version in git using `git diff` which is integrated with `nbdiff`. diff --git a/docker/bashrc b/docker/bashrc deleted file mode 100644 index 3535389..0000000 --- a/docker/bashrc +++ /dev/null @@ -1,12 +0,0 @@ -alias ll="ls -l" - -nbd() { - DIRNAME=$(dirname "$1") - BASENAME=$(basename "$1" .ipynb) - - WORKING_COPY=$DIRNAME/$BASENAME.ipynb - CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb - - # echo "How change $CHECKPOINT_COPY into $WORKING_COPY" - nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" -} diff --git a/docker/bashrc.bash b/docker/bashrc.bash new file mode 100644 index 0000000..ff19745 --- /dev/null +++ b/docker/bashrc.bash @@ -0,0 +1,3 @@ +alias ll="ls -alF" +alias nbd="nbdiff_checkpoint" +alias tb="tensorboard --logdir=tf_logs" diff --git a/docker/bin/nbclean_checkpoints b/docker/bin/nbclean_checkpoints new file mode 100755 index 0000000..ba4aaf9 --- /dev/null +++ b/docker/bin/nbclean_checkpoints @@ -0,0 +1,116 @@ +#!/usr/bin/env python + +import collections +import glob +import hashlib +import os +import subprocess + + +class NotebookAnalyser: + + def __init__(self, dry_run=False, verbose=False, colorful=False): + self._dry_run = dry_run + self._verbose = verbose + self._colors = collections.defaultdict(lambda: "") + if colorful: + for color in [ + NotebookAnalyser.COLOR_WHITE, + NotebookAnalyser.COLOR_RED, + NotebookAnalyser.COLOR_GREEN, + NotebookAnalyser.COLOR_YELLOW, + ]: + self._colors[color] = "\033[{}m".format(color) + + NOTEBOOK_SUFFIX = ".ipynb" + CHECKPOINT_DIR = NOTEBOOK_SUFFIX + "_checkpoints" + CHECKPOINT_MASK = "*-checkpoint" + NOTEBOOK_SUFFIX + CHECKPOINT_MASK_LEN = len(CHECKPOINT_MASK) - 1 + + @staticmethod + def get_hash(file_path): + with open(file_path, "rb") as input: + hash = hashlib.md5() + for chunk in iter(lambda: input.read(4096), b""): + hash.update(chunk) + return hash.hexdigest() + + MESSAGE_ORPHANED = "missing " + MESSAGE_MODIFIED = "modified" + MESSAGE_DELETED = "DELETING" + + COLOR_WHITE = "0" + COLOR_RED = "31" + COLOR_GREEN = "32" + COLOR_YELLOW = "33" + + def log(self, message, file, color=COLOR_WHITE): + color_on = self._colors[color] + color_off = self._colors[NotebookAnalyser.COLOR_WHITE] + print("{}{}{}: {}".format(color_on, message, color_off, file)) + + def 
clean_checkpoints(self, directory): + for checkpoint_path in sorted(glob.glob(os.path.join(directory, NotebookAnalyser.CHECKPOINT_MASK))): + + workfile_dir = os.path.dirname(os.path.dirname(checkpoint_path)) + workfile_name = os.path.basename(checkpoint_path)[:-NotebookAnalyser.CHECKPOINT_MASK_LEN] + NotebookAnalyser.NOTEBOOK_SUFFIX + workfile_path = os.path.join(workfile_dir, workfile_name) + + status = "" + if not os.path.isfile(workfile_path): + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_ORPHANED, workfile_path, NotebookAnalyser.COLOR_RED) + else: + checkpoint_stat = os.stat(checkpoint_path) + workfile_stat = os.stat(workfile_path) + + modified = workfile_stat.st_size != checkpoint_stat.st_size + + if not modified: + checkpoint_hash = NotebookAnalyser.get_hash(checkpoint_path) + workfile_hash = NotebookAnalyser.get_hash(workfile_path) + modified = checkpoint_hash != workfile_hash + + if modified: + if self._verbose: + self.log(NotebookAnalyser.MESSAGE_MODIFIED, workfile_path, NotebookAnalyser.COLOR_YELLOW) + else: + self.log(NotebookAnalyser.MESSAGE_DELETED, checkpoint_path, NotebookAnalyser.COLOR_GREEN) + if not self._dry_run: + os.remove(checkpoint_path) + + if not self._dry_run and not os.listdir(directory): + self.log(NotebookAnalyser.MESSAGE_DELETED, directory, NotebookAnalyser.COLOR_GREEN) + os.rmdir(directory) + + def clean_checkpoints_recursively(self, directory): + for (root, subdirs, files) in os.walk(directory): + subdirs.sort() # INFO: traverse alphabetically + if NotebookAnalyser.CHECKPOINT_DIR in subdirs: + subdirs.remove(NotebookAnalyser.CHECKPOINT_DIR) # INFO: don't recurse there + self.clean_checkpoints(os.path.join(root, NotebookAnalyser.CHECKPOINT_DIR)) + + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove checkpointed versions of those jupyter notebooks that are identical to their working copies.", + epilog="""Notebooks will be reported as either + "DELETED" if the working copy and checkpointed version are identical + (checkpoint will be deleted), + "missing" if there is a checkpoint but no corresponding working file can be found + or "modified" if notebook and the checkpoint are not byte-to-byte identical. + If removal of checkpoints results in empty ".ipynb_checkpoints" directory + that directory is also deleted. 
+ """) #, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("dirs", metavar="DIR", type=str, nargs="*", default=".", help="directories to search") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-v", "--verbose", action="store_true", help="verbose mode") + parser.add_argument("-c", "--color", action="store_true", help="colorful mode") + args = parser.parse_args() + + analyser = NotebookAnalyser(args.dry_run, args.verbose, args.color) + for directory in args.dirs: + analyser.clean_checkpoints_recursively(directory) + +if __name__ == "__main__": + main() diff --git a/docker/bin/nbdiff_checkpoint b/docker/bin/nbdiff_checkpoint new file mode 100755 index 0000000..9ce7cd0 --- /dev/null +++ b/docker/bin/nbdiff_checkpoint @@ -0,0 +1,17 @@ +#!/bin/bash +if [[ "$#" -lt 1 || "$1" =~ ^((-h)|(--help))$ ]] ; then + echo "usage: nbdiff_checkpoint NOTEBOOK.ipynb" + echo + echo "Show differences between given jupyter notebook and its checkpointed version (in .ipynb_checkpoints subdirectory)" + exit +fi + +DIRNAME=$(dirname "$1") +BASENAME=$(basename "$1" .ipynb) +shift + +WORKING_COPY=$DIRNAME/$BASENAME.ipynb +CHECKPOINT_COPY=$DIRNAME/.ipynb_checkpoints/$BASENAME-checkpoint.ipynb + +echo "----- Analysing how to change $CHECKPOINT_COPY into $WORKING_COPY -----" +nbdiff "$CHECKPOINT_COPY" "$WORKING_COPY" --ignore-details "$@" diff --git a/docker/bin/rm_empty_subdirs b/docker/bin/rm_empty_subdirs new file mode 100755 index 0000000..34f3ea9 --- /dev/null +++ b/docker/bin/rm_empty_subdirs @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import os + +def remove_empty_directories(initial_dir, + allow_initial_delete=False, ignore_nonexistant_initial=False, + dry_run=False, quiet=False): + + FORBIDDEN_SUBDIRS = set([".git"]) + + if not os.path.isdir(initial_dir) and not ignore_nonexistant_initial: + raise RuntimeError("Initial directory '{}' not found!".format(initial_dir)) + + message = "removed" + if dry_run: + message = "to be " + message + + deleted = set() + + for (directory, subdirs, files) in os.walk(initial_dir, topdown=False): + forbidden = False + parent = directory + while parent: + parent, dirname = os.path.split(parent) + if dirname in FORBIDDEN_SUBDIRS: + forbidden = True + break + if forbidden: + continue + + is_empty = len(files) < 1 and len(set([os.path.join(directory, s) for s in subdirs]) - deleted) < 1 + + if is_empty and (initial_dir != directory or allow_initial_delete): + if not quiet: + print("{}: {}".format(message, directory)) + deleted.add(directory) + if not dry_run: + os.rmdir(directory) + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Remove empty directories recursively in subtree.") + parser.add_argument("dir", metavar="DIR", type=str, nargs="+", help="directory to be searched") + parser.add_argument("-r", "--allow-dir-removal", action="store_true", help="allow deletion of DIR itself") + parser.add_argument("-i", "--ignore-nonexistent-dir", action="store_true", help="don't throw an error if DIR doesn't exist") + parser.add_argument("-d", "--dry-run", action="store_true", help="only print messages, don't perform any removals") + parser.add_argument("-q", "--quiet", action="store_true", help="don't print names of directories being removed") + args = parser.parse_args() + for directory in args.dir: + remove_empty_directories(directory, args.allow_dir_removal, args.ignore_nonexistent_dir, + args.dry_run, args.quiet) + +if __name__ == 
"__main__": + main() diff --git a/docker/bin/tensorboard b/docker/bin/tensorboard new file mode 100755 index 0000000..dd7294d --- /dev/null +++ b/docker/bin/tensorboard @@ -0,0 +1,2 @@ +#!/bin/bash +python -m tensorboard.main "$@" diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 8a9718c..d4b46e4 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,6 +15,7 @@ services: max-size: 50m ports: - "8888:8888" + - "6006:6006" volumes: - ../:/home/devel/handson-ml command: /opt/conda/bin/jupyter notebook --ip='*' --port=8888 --no-browser diff --git a/docker/jupyter_notebook_config.py b/docker/jupyter_notebook_config.py new file mode 100644 index 0000000..971a49a --- /dev/null +++ b/docker/jupyter_notebook_config.py @@ -0,0 +1,15 @@ +import os +import subprocess + +def export_script_and_view(model, os_path, contents_manager): + if model["type"] != "notebook": + return + dir_name, file_name = os.path.split(os_path) + file_base, file_ext = os.path.splitext(file_name) + if file_base.startswith("Untitled"): + return + export_name = file_base if file_ext == ".ipynb" else file_name + subprocess.check_call(["jupyter", "nbconvert", "--to", "script", file_name, "--output", export_name + "_script"], cwd=dir_name) + subprocess.check_call(["jupyter", "nbconvert", "--to", "html", file_name, "--output", export_name + "_view"], cwd=dir_name) + +c.FileContentsManager.post_save_hook = export_script_and_view diff --git a/docker/nbdime-1-details.patch b/docker/nbdime-1-details.patch new file mode 100644 index 0000000..98f76d6 --- /dev/null +++ b/docker/nbdime-1-details.patch @@ -0,0 +1,17 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -548,8 +548,12 @@ def set_notebook_diff_targets(sources=True, outputs=True, attachments=True, meta + metadata_keys = ("/cells/*/metadata", "/metadata", "/cells/*/outputs/*/metadata") + if metadata: + for key in metadata_keys: +- if key in notebook_differs: +- del notebook_differs[key] ++ if details: ++ if key in notebook_differs: ++ del notebook_differs[key] ++ else: ++ notebook_differs[key] = diff_ignore_keys( ++ inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore diff --git a/docker/nbdime-2-toc.patch b/docker/nbdime-2-toc.patch new file mode 100644 index 0000000..4924e66 --- /dev/null +++ b/docker/nbdime-2-toc.patch @@ -0,0 +1,11 @@ +--- a/nbdime/diffing/notebooks.py ++++ b/nbdime/diffing/notebooks.py +@@ -553,7 +553,7 @@ + del notebook_differs[key] + else: + notebook_differs[key] = diff_ignore_keys( +- inner_differ=diff, ignore_keys=['collapsed', 'autoscroll', 'deletable', 'editable']) ++ inner_differ=diff, ignore_keys=['toc', 'collapsed', 'autoscroll', 'deletable', 'editable']) + else: + for key in metadata_keys: + notebook_differs[key] = diff_ignore