From 3680f6a27c01061717e4846cc7a6f4f7006bdd83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?=
Date: Fri, 2 Jun 2017 16:02:35 +0200
Subject: [PATCH] Add a few explanations in notebook 08, and better sync code
 with book

---
 08_dimensionality_reduction.ipynb | 1219 +++++++++++++++++------------
 1 file changed, 733 insertions(+), 486 deletions(-)

diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb
index aac7851..c42e357 100644
--- a/08_dimensionality_reduction.ipynb
+++ b/08_dimensionality_reduction.ipynb
@@ -121,12 +121,42 @@
     "editable": true
    },
    "source": [
-    "Mean normalize the data:"
+    "## PCA using SVD decomposition"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "X_centered = X - X.mean(axis=0)\n",
+    "U, s, V = np.linalg.svd(X_centered)\n",
+    "c1 = V.T[:, 0]  # first principal component\n",
+    "c2 = V.T[:, 1]  # second principal component"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "m, n = X.shape\n",
+    "\n",
+    "S = np.zeros(X_centered.shape)  # rebuild the m x n matrix S from the vector s of singular values\n",
+    "S[:n, :n] = np.diag(s)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
    "metadata": {
     "collapsed": false,
     "deletable": true,
@@ -134,22 +164,51 @@
    "outputs": [],
    "source": [
-    "X = X - X.mean(axis=0)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "deletable": true,
-    "editable": true
-   },
-   "source": [
-    "Apply PCA to reduce to 2D."
+    "np.allclose(X_centered, U.dot(S).dot(V))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false,
+    "deletable": true,
+    "editable": true
+   },
+   "outputs": [],
+   "source": [
+    "W2 = V.T[:, :2]\n",
+    "X2D = X_centered.dot(W2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X2D_using_svd = X2D"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## PCA using Scikit-Learn"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With Scikit-Learn, PCA is really trivial. It even takes care of mean centering for you:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
    "metadata": {
     "collapsed": false,
     "deletable": true,
@@ -163,6 +222,50 @@
     "X2D = pca.fit_transform(X)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "X2D[:5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "X2D_using_svd[:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true,
+    "deletable": true,
+    "editable": true
+   },
+   "source": [
+    "Notice that running PCA multiple times on slightly different datasets may produce different results. In general, the only difference is that some axes may be flipped. 
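This is because the direction of each principal component is arbitrary: a component and its opposite define the same axis, so two runs may simply pick opposite signs. 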
In this example, PCA using Scikit-Learn gives the same projection as the one given by the SVD approach, except both axes are flipped:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.allclose(X2D, -X2D_using_svd)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -175,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 12, "metadata": { "collapsed": true, "deletable": true, @@ -183,7 +286,202 @@ }, "outputs": [], "source": [ - "X2D_inv = pca.inverse_transform(X2D)" + "X3D_inv = pca.inverse_transform(X2D)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Of course, there was some loss of information during the projection step, so the recovered 3D points are not exactly equal to the original 3D points:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.allclose(X3D_inv, X)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can compute the reconstruction error:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.mean(np.sum(np.square(X3D_inv - X), axis=1))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The inverse transform in the SVD approach looks like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "X3D_inv_using_svd = X2D_using_svd.dot(V[:2, :])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The reconstructions from both methods are not identical because Scikit-Learn's `PCA` class automatically takes care of reversing the mean centering, but if we subtract the mean, we get the same reconstruction:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.allclose(X3D_inv_using_svd, X3D_inv - pca.mean_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `PCA` object gives access to the principal components that it computed:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pca.components_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compare to the first two principal components computed using the SVD method:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "V[:2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notice how the axes are flipped." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's look at the explained variance ratio:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "print(pca.explained_variance_ratio_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first dimension explains 84.2% of the variance, while the second explains 14.6%." 
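+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Summing these ratios gives the total fraction of variance preserved by the 2D projection:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "pca.explained_variance_ratio_.sum()  # about 0.989 for this dataset"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Together the two components account for almost 99% of the variance."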
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By projecting down to 2D, we lost about 1.1% of the variance:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "1 - pca.explained_variance_ratio_.sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is how to compute the explained variance ratio using the SVD approach (recall that `s` is the diagonal of the matrix `S`):" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.square(s) / np.square(s).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's generate some nice figures! :)" ] }, { @@ -198,7 +496,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 22, "metadata": { "collapsed": true, "deletable": true, @@ -233,7 +531,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 23, "metadata": { "collapsed": false, "deletable": true, @@ -264,7 +562,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 24, "metadata": { "collapsed": false, "deletable": true, @@ -277,8 +575,8 @@ "fig = plt.figure(figsize=(6, 3.8))\n", "ax = fig.add_subplot(111, projection='3d')\n", "\n", - "X3D_above = X[X[:, 2] > X2D_inv[:, 2]]\n", - "X3D_below = X[X[:, 2] <= X2D_inv[:, 2]]\n", + "X3D_above = X[X[:, 2] > X3D_inv[:, 2]]\n", + "X3D_below = X[X[:, 2] <= X3D_inv[:, 2]]\n", "\n", "ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], \"bo\", alpha=0.5)\n", "\n", @@ -289,13 +587,13 @@ "ax.plot([0], [0], [0], \"k.\")\n", "\n", "for i in range(m):\n", - " if X[i, 2] > X2D_inv[i, 2]:\n", - " ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], \"k-\")\n", + " if X[i, 2] > X3D_inv[i, 2]:\n", + " ax.plot([X[i][0], X3D_inv[i][0]], [X[i][1], X3D_inv[i][1]], [X[i][2], X3D_inv[i][2]], \"k-\")\n", " else:\n", - " ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], \"k-\", color=\"#505050\")\n", + " ax.plot([X[i][0], X3D_inv[i][0]], [X[i][1], X3D_inv[i][1]], [X[i][2], X3D_inv[i][2]], \"k-\", color=\"#505050\")\n", " \n", - "ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], \"k+\")\n", - "ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], \"k.\")\n", + "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k+\")\n", + "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k.\")\n", "ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], \"bo\")\n", "ax.set_xlabel(\"$x_1$\", fontsize=18)\n", "ax.set_ylabel(\"$x_2$\", fontsize=18)\n", @@ -310,7 +608,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 25, "metadata": { "collapsed": false, "deletable": true, @@ -333,222 +631,6 @@ "save_fig(\"dataset_2d_plot\")" ] }, - { - "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, - "source": [ - "PCA using SVD decomposition" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "m, n = X.shape\n", - "\n", - "X_centered = X - X.mean(axis=0)\n", - "U, s, V = np.linalg.svd(X_centered)\n", - "c1 = V.T[:, 0]\n", - "c2 = V.T[:, 1]\n", - "\n", - "S = np.zeros(X.shape)\n", - "S[:n, :n] = np.diag(s)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - 
"metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "np.allclose(X, U.dot(S).dot(V))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "T = X.dot(V.T[:, :2])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "np.allclose(T, U.dot(S)[:, :2])" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "from sklearn.decomposition import PCA\n", - "pca = PCA(n_components = 2)\n", - "X2D_p = pca.fit_transform(X)\n", - "np.allclose(X2D_p, T)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "X3D_recover = T.dot(V[:2, :])" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "np.allclose(X3D_recover, pca.inverse_transform(X2D_p))" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "V" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "pca.components_" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "R = pca.components_.T.dot(pca.components_)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "S[:3]" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "pca.explained_variance_ratio_" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "1 - pca.explained_variance_ratio_.sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "S[0,0]**2/(S**2).sum(), S[1,1]**2/(S**2).sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "np.sqrt((T[:, 1]**2).sum())" - ] - }, { "cell_type": "markdown", "metadata": { @@ -562,7 +644,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": { "collapsed": true, "deletable": true, @@ -576,7 +658,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": { "collapsed": false, "deletable": true, @@ -604,7 +686,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": { "collapsed": false, "deletable": true, @@ -633,7 +715,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 29, 
"metadata": { "collapsed": false, "deletable": true, @@ -728,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "metadata": { "collapsed": false, "deletable": true, @@ -808,7 +890,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "collapsed": true, "deletable": true, @@ -818,32 +900,12 @@ "source": [ "from six.moves import urllib\n", "from sklearn.datasets import fetch_mldata\n", - "try:\n", - " mnist = fetch_mldata('MNIST original')\n", - "except urllib.error.HTTPError as ex:\n", - " print(\"Could not download MNIST data from mldata.org, trying alternative...\")\n", - "\n", - " # Alternative method to load MNIST, if mldata.org is down\n", - " from scipy.io import loadmat\n", - " mnist_alternative_url = \"https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat\"\n", - " mnist_path = \"./mnist-original.mat\"\n", - " response = urllib.request.urlopen(mnist_alternative_url)\n", - " with open(mnist_path, \"wb\") as f:\n", - " content = response.read()\n", - " f.write(content)\n", - " mnist_raw = loadmat(mnist_path)\n", - " mnist = {\n", - " \"data\": mnist_raw[\"data\"].T,\n", - " \"target\": mnist_raw[\"label\"][0],\n", - " \"COL_NAMES\": [\"label\", \"data\"],\n", - " \"DESCR\": \"mldata.org dataset: mnist-original\",\n", - " }\n", - " print(\"Success!\")" + "mnist = fetch_mldata('MNIST original')" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 32, "metadata": { "collapsed": true, "deletable": true, @@ -861,7 +923,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 33, "metadata": { "collapsed": false, "deletable": true, @@ -869,17 +931,26 @@ }, "outputs": [], "source": [ - "X = X_train\n", - "\n", "pca = PCA()\n", - "pca.fit(X)\n", - "d = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1\n", + "pca.fit(X_train)\n", + "cumsum = np.cumsum(pca.explained_variance_ratio_)\n", + "d = np.argmax(cumsum >= 0.95) + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "d" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 35, "metadata": { "collapsed": false, "deletable": true, @@ -888,13 +959,23 @@ "outputs": [], "source": [ "pca = PCA(n_components=0.95)\n", - "X_reduced = pca.fit_transform(X)\n", + "X_reduced = pca.fit_transform(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "pca.n_components_" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 37, "metadata": { "collapsed": false, "deletable": true, @@ -907,7 +988,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 38, "metadata": { "collapsed": false, "deletable": true, @@ -915,16 +996,14 @@ }, "outputs": [], "source": [ - "X_mnist = X_train\n", - "\n", "pca = PCA(n_components = 154)\n", - "X_mnist_reduced = pca.fit_transform(X_mnist)\n", - "X_mnist_recovered = pca.inverse_transform(X_mnist_reduced)" + "X_reduced = pca.fit_transform(X_train)\n", + "X_recovered = pca.inverse_transform(X_reduced)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 39, "metadata": { "collapsed": true, "deletable": true, @@ -948,79 +1027,6 @@ " plt.axis(\"off\")" ] }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - 
"source": [ - "plt.figure(figsize=(7, 4))\n", - "plt.subplot(121)\n", - "plot_digits(X_mnist[::2100])\n", - "plt.title(\"Original\", fontsize=16)\n", - "plt.subplot(122)\n", - "plot_digits(X_mnist_recovered[::2100])\n", - "plt.title(\"Compressed\", fontsize=16)\n", - "\n", - "save_fig(\"mnist_compression_plot\")" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "from sklearn.decomposition import IncrementalPCA\n", - "\n", - "n_batches = 100\n", - "inc_pca = IncrementalPCA(n_components=154)\n", - "for X_batch in np.array_split(X_mnist, n_batches):\n", - " print(\".\", end=\"\")\n", - " inc_pca.partial_fit(X_batch)\n", - "\n", - "X_mnist_reduced_inc = inc_pca.transform(X_mnist)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "X_mnist_recovered_inc = inc_pca.inverse_transform(X_mnist_reduced_inc)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "plt.figure(figsize=(7, 4))\n", - "plt.subplot(121)\n", - "plot_digits(X_mnist[::2100])\n", - "plt.subplot(122)\n", - "plot_digits(X_mnist_recovered_inc[::2100])\n", - "plt.tight_layout()" - ] - }, { "cell_type": "code", "execution_count": 40, @@ -1031,20 +1037,33 @@ }, "outputs": [], "source": [ - "np.allclose(pca.mean_, inc_pca.mean_)" + "plt.figure(figsize=(7, 4))\n", + "plt.subplot(121)\n", + "plot_digits(X_train[::2100])\n", + "plt.title(\"Original\", fontsize=16)\n", + "plt.subplot(122)\n", + "plot_digits(X_recovered[::2100])\n", + "plt.title(\"Compressed\", fontsize=16)\n", + "\n", + "save_fig(\"mnist_compression_plot\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ - "np.allclose(X_mnist_reduced, X_mnist_reduced_inc)" + "X_reduced_pca = X_reduced" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Incremental PCA" ] }, { @@ -1057,28 +1076,28 @@ }, "outputs": [], "source": [ - "filename = \"my_mnist.data\"\n", + "from sklearn.decomposition import IncrementalPCA\n", "\n", - "X_mm = np.memmap(filename, dtype='float32', mode='write', shape=X_mnist.shape)\n", - "X_mm[:] = X_mnist\n", - "del X_mm" + "n_batches = 100\n", + "inc_pca = IncrementalPCA(n_components=154)\n", + "for X_batch in np.array_split(X_train, n_batches):\n", + " print(\".\", end=\"\") # not shown in the book\n", + " inc_pca.partial_fit(X_batch)\n", + "\n", + "X_reduced = inc_pca.transform(X_train)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { - "collapsed": false, + "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ - "X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=X_mnist.shape)\n", - "\n", - "batch_size = len(X_mnist) // n_batches\n", - "inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)\n", - "inc_pca.fit(X_mm)" + "X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)" ] }, { @@ -1091,43 +1110,30 @@ }, "outputs": [], "source": [ - "rnd_pca = PCA(n_components=154, random_state=42, svd_solver=\"randomized\")\n", - "X_reduced = rnd_pca.fit_transform(X_mnist)" + "plt.figure(figsize=(7, 4))\n", + "plt.subplot(121)\n", + "plot_digits(X_train[::2100])\n", + 
"plt.subplot(122)\n", + "plot_digits(X_recovered_inc_pca[::2100])\n", + "plt.tight_layout()" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { - "collapsed": false, - "deletable": true, - "editable": true + "collapsed": true }, "outputs": [], "source": [ - "import time\n", - "\n", - "for n_components in (2, 10, 154):\n", - " print(\"n_components =\", n_components)\n", - " regular_pca = PCA(n_components=n_components)\n", - " inc_pca = IncrementalPCA(n_components=154, batch_size=500)\n", - " rnd_pca = PCA(n_components=154, random_state=42, svd_solver=\"randomized\")\n", - "\n", - " for pca in (regular_pca, inc_pca, rnd_pca):\n", - " t1 = time.time()\n", - " pca.fit(X_mnist)\n", - " t2 = time.time()\n", - " print(pca.__class__.__name__, t2 - t1, \"seconds\")" + "X_reduced_inc_pca = X_reduced" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ - "# Kernel PCA" + "Let's compare the results of transforming MNIST using regular PCA and incremental PCA. First, the means are equal: " ] }, { @@ -1140,34 +1146,14 @@ }, "outputs": [], "source": [ - "from sklearn.decomposition import KernelPCA\n", - "\n", - "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)\n", - "\n", - "lin_pca = KernelPCA(n_components = 2, kernel=\"linear\", fit_inverse_transform=True)\n", - "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433, fit_inverse_transform=True)\n", - "sig_pca = KernelPCA(n_components = 2, kernel=\"sigmoid\", gamma=0.001, coef0=1, fit_inverse_transform=True)\n", - "\n", - "y = t > 6.9\n", - "\n", - "plt.figure(figsize=(11, 4))\n", - "for subplot, pca, title in ((131, lin_pca, \"Linear kernel\"), (132, rbf_pca, \"RBF kernel, $\\gamma=0.04$\"), (133, sig_pca, \"Sigmoid kernel, $\\gamma=10^{-3}, r=1$\")):\n", - " X_reduced = pca.fit_transform(X)\n", - " if subplot == 132:\n", - " X_reduced_rbf = X_reduced\n", - " \n", - " plt.subplot(subplot)\n", - " #plt.plot(X_reduced[y, 0], X_reduced[y, 1], \"gs\")\n", - " #plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], \"y^\")\n", - " plt.title(title, fontsize=14)\n", - " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n", - " plt.xlabel(\"$z_1$\", fontsize=18)\n", - " if subplot == 131:\n", - " plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", - " plt.grid(True)\n", - "\n", - "save_fig(\"kernel_pca_plot\")\n", - "plt.show()" + "np.allclose(pca.mean_, inc_pca.mean_)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But the results are not exactly identical. Incremental PCA gives a very good approximate solution, but it's not perfect:" ] }, { @@ -1180,22 +1166,21 @@ }, "outputs": [], "source": [ - "plt.figure(figsize=(6, 5))\n", - "\n", - "X_inverse = pca.inverse_transform(X_reduced_rbf)\n", - "\n", - "ax = plt.subplot(111, projection='3d')\n", - "ax.view_init(10, -70)\n", - "ax.scatter(X_inverse[:, 0], X_inverse[:, 1], X_inverse[:, 2], c=t, cmap=plt.cm.hot, marker=\"x\")\n", - "ax.set_xlabel(\"\")\n", - "ax.set_ylabel(\"\")\n", - "ax.set_zlabel(\"\")\n", - "ax.set_xticklabels([])\n", - "ax.set_yticklabels([])\n", - "ax.set_zticklabels([])\n", - "\n", - "save_fig(\"preimage_plot\", tight_layout=False)\n", - "plt.show()" + "np.allclose(X_reduced_pca, X_reduced_inc_pca)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using `memmap()`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create the `memmap()` structure and copy the MNIST data into it. 
This would typically be done by a first program:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
-    "X_reduced = rbf_pca.fit_transform(X)\n",
+    "filename = \"my_mnist.data\"\n",
+    "m, n = X_train.shape\n",
     "\n",
-    "plt.figure(figsize=(11, 4))\n",
-    "plt.subplot(132)\n",
-    "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot, marker=\"x\")\n",
-    "plt.xlabel(\"$z_1$\", fontsize=18)\n",
-    "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
-    "plt.grid(True)"
+    "X_mm = np.memmap(filename, dtype='float32', mode='write', shape=(m, n))\n",
+    "X_mm[:] = X_train  # copy the training set into the memory-mapped array"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now deleting the `memmap()` object will trigger its Python finalizer, which ensures that the data is saved to disk."
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
-    "collapsed": false,
-    "deletable": true,
-    "editable": true
+    "collapsed": false
   },
   "outputs": [],
   "source": [
-    "from sklearn.pipeline import Pipeline\n",
-    "from sklearn.linear_model import LogisticRegression\n",
-    "from sklearn.model_selection import GridSearchCV\n",
-    "\n",
-    "clf = Pipeline([\n",
-    "        (\"kpca\", KernelPCA(n_components=2)),\n",
-    "        (\"log_reg\", LogisticRegression())\n",
-    "    ])\n",
-    "\n",
-    "param_grid = [\n",
-    "        {\"kpca__gamma\": np.linspace(0.03, 0.05, 10), \"kpca__kernel\": [\"rbf\", \"sigmoid\"]}\n",
-    "    ]\n",
-    "\n",
-    "grid_search = GridSearchCV(clf, param_grid, cv=3)\n",
-    "grid_search.fit(X, y)\n",
-    "grid_search.best_params_"
+    "del X_mm  # triggers the finalizer, which flushes the data to disk"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, another program would load the data and use it for training:"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
-    "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433,\n",
-    "                    fit_inverse_transform=True)\n",
-    "X_reduced = rbf_pca.fit_transform(X)\n",
-    "X_preimage = rbf_pca.inverse_transform(X_reduced)"
+    "X_mm = np.memmap(filename, dtype=\"float32\", mode=\"readonly\", shape=(m, n))\n",
+    "\n",
+    "batch_size = m // n_batches\n",
+    "inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)\n",
+    "inc_pca.fit(X_mm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
   "source": [
-    "from sklearn.metrics import mean_squared_error\n",
-    "mean_squared_error(X, X_preimage)"
+    "rnd_pca = PCA(n_components=154, svd_solver=\"randomized\", random_state=42)\n",
+    "X_reduced = rnd_pca.fit_transform(X_train)"
  ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Time complexity"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's time regular PCA against Incremental PCA and Randomized PCA, for various numbers of principal components. Recall that Randomized PCA's computational complexity is $O(m \times d^2) + O(d^3)$, instead of $O(m \times n^2) + O(n^3)$ for the full SVD approach, so it should be much faster when $d$ is much smaller than $n$:"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": false,
    "deletable": true,
    "editable": true
   },
   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "for n_components in (2, 10, 154):\n",
+    "    print(\"n_components =\", n_components)\n",
+    "    regular_pca = PCA(n_components=n_components)\n",
+    "    inc_pca = IncrementalPCA(n_components=n_components, batch_size=500)\n",
+    "    rnd_pca = PCA(n_components=n_components, random_state=42, svd_solver=\"randomized\")\n",
+    "\n",
+    "    for pca in (regular_pca, inc_pca, rnd_pca):\n",
+    "        t1 = time.time()\n",
+    "        pca.fit(X_train)\n",
+    "        t2 = time.time()\n",
+    "        print(\"    {}: {:.1f} seconds\".format(pca.__class__.__name__, t2 - t1))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now let's compare PCA and Randomized PCA for datasets of different sizes (number of instances):"
+   ]
+  },
  {
"cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], "source": [ "times_rpca = []\n", "times_pca = []\n", "sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n", "for n_samples in sizes:\n", " X = rnd.randn(n_samples, 5)\n", - " pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n", + " pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", @@ -1310,13 +1336,21 @@ "plt.title(\"PCA and Randomized PCA time complexity \")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now let's compare their performance on datasets of 2,000 instances with various numbers of features:" + ] + }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 54, "metadata": { "collapsed": false, "deletable": true, - "editable": true + "editable": true, + "scrolled": true }, "outputs": [], "source": [ @@ -1344,6 +1378,197 @@ "plt.title(\"PCA and Randomized PCA time complexity \")" ] }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Kernel PCA" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.decomposition import KernelPCA\n", + "\n", + "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.04)\n", + "X_reduced = rbf_pca.fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from sklearn.decomposition import KernelPCA\n", + "\n", + "lin_pca = KernelPCA(n_components = 2, kernel=\"linear\", fit_inverse_transform=True)\n", + "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433, fit_inverse_transform=True)\n", + "sig_pca = KernelPCA(n_components = 2, kernel=\"sigmoid\", gamma=0.001, coef0=1, fit_inverse_transform=True)\n", + "\n", + "y = t > 6.9\n", + "\n", + "plt.figure(figsize=(11, 4))\n", + "for subplot, pca, title in ((131, lin_pca, \"Linear kernel\"), (132, rbf_pca, \"RBF kernel, $\\gamma=0.04$\"), (133, sig_pca, \"Sigmoid kernel, $\\gamma=10^{-3}, r=1$\")):\n", + " X_reduced = pca.fit_transform(X)\n", + " if subplot == 132:\n", + " X_reduced_rbf = X_reduced\n", + " \n", + " plt.subplot(subplot)\n", + " #plt.plot(X_reduced[y, 0], X_reduced[y, 1], \"gs\")\n", + " #plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], \"y^\")\n", + " plt.title(title, fontsize=14)\n", + " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n", + " plt.xlabel(\"$z_1$\", fontsize=18)\n", + " if subplot == 131:\n", + " plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", + " plt.grid(True)\n", + "\n", + "save_fig(\"kernel_pca_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(6, 5))\n", + "\n", + "X_inverse = pca.inverse_transform(X_reduced_rbf)\n", + "\n", + "ax = plt.subplot(111, projection='3d')\n", + "ax.view_init(10, -70)\n", + "ax.scatter(X_inverse[:, 0], X_inverse[:, 1], 
X_inverse[:, 2], c=t, cmap=plt.cm.hot, marker=\"x\")\n", + "ax.set_xlabel(\"\")\n", + "ax.set_ylabel(\"\")\n", + "ax.set_zlabel(\"\")\n", + "ax.set_xticklabels([])\n", + "ax.set_yticklabels([])\n", + "ax.set_zticklabels([])\n", + "\n", + "save_fig(\"preimage_plot\", tight_layout=False)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "X_reduced = rbf_pca.fit_transform(X)\n", + "\n", + "plt.figure(figsize=(11, 4))\n", + "plt.subplot(132)\n", + "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot, marker=\"x\")\n", + "plt.xlabel(\"$z_1$\", fontsize=18)\n", + "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", + "plt.grid(True)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.pipeline import Pipeline\n", + "\n", + "clf = Pipeline([\n", + " (\"kpca\", KernelPCA(n_components=2)),\n", + " (\"log_reg\", LogisticRegression())\n", + " ])\n", + "\n", + "param_grid = [{\n", + " \"kpca__gamma\": np.linspace(0.03, 0.05, 10),\n", + " \"kpca__kernel\": [\"rbf\", \"sigmoid\"]\n", + " }]\n", + "\n", + "grid_search = GridSearchCV(clf, param_grid, cv=3)\n", + "grid_search.fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(grid_search.best_params_)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433,\n", + " fit_inverse_transform=True)\n", + "X_reduced = rbf_pca.fit_transform(X)\n", + "X_preimage = rbf_pca.inverse_transform(X_reduced)" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_squared_error\n", + "\n", + "mean_squared_error(X, X_preimage)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -1356,7 +1581,18 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 64, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, "metadata": { "collapsed": false, "deletable": true, @@ -1366,11 +1602,18 @@ "source": [ "from sklearn.manifold import LocallyLinearEmbedding\n", "\n", - "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)\n", - "\n", - "lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2, random_state=42)\n", - "X_reduced = lle.fit_transform(X)\n", - "\n", + "lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)\n", + "X_reduced = lle.fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ "plt.title(\"Unrolled swiss roll using LLE\", fontsize=14)\n", "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n", "plt.xlabel(\"$z_1$\", fontsize=18)\n", @@ -1394,7 +1637,7 @@ }, { "cell_type": "code", - "execution_count": 55, 
+ "execution_count": 67, "metadata": { "collapsed": false, "deletable": true, @@ -1403,13 +1646,14 @@ "outputs": [], "source": [ "from sklearn.manifold import MDS\n", + "\n", "mds = MDS(n_components=2, random_state=42)\n", "X_reduced_mds = mds.fit_transform(X)" ] }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 68, "metadata": { "collapsed": false, "deletable": true, @@ -1418,13 +1662,14 @@ "outputs": [], "source": [ "from sklearn.manifold import Isomap\n", + "\n", "isomap = Isomap(n_components=2)\n", "X_reduced_isomap = isomap.fit_transform(X)" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 69, "metadata": { "collapsed": true, "deletable": true, @@ -1433,13 +1678,14 @@ "outputs": [], "source": [ "from sklearn.manifold import TSNE\n", + "\n", "tsne = TSNE(n_components=2)\n", "X_reduced_tsne = tsne.fit_transform(X)" ] }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 70, "metadata": { "collapsed": false, "deletable": true, @@ -1448,6 +1694,7 @@ "outputs": [], "source": [ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", + "\n", "lda = LinearDiscriminantAnalysis(n_components=2)\n", "X_mnist = mnist[\"data\"]\n", "y_mnist = mnist[\"target\"]\n", @@ -1457,7 +1704,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 71, "metadata": { "collapsed": false, "deletable": true, @@ -1532,7 +1779,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2+" + "version": "3.5.3" }, "nav_menu": { "height": "352px",