{ "cells": [ { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "**Chapter 8 – Dimensionality Reduction**" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "_This notebook contains all the sample code and solutions to the exercices in chapter 8._" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Setup" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "# To support both python 2 and python 3\n", "from __future__ import division, print_function, unicode_literals\n", "\n", "# Common imports\n", "import numpy as np\n", "import numpy.random as rnd\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", "rnd.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "plt.rcParams['axes.labelsize'] = 14\n", "plt.rcParams['xtick.labelsize'] = 12\n", "plt.rcParams['ytick.labelsize'] = 12\n", "\n", "# Where to save the figures\n", "PROJECT_ROOT_DIR = \".\"\n", "CHAPTER_ID = \"dim_reduction\"\n", "\n", "def save_fig(fig_id, tight_layout=True):\n", " path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n", " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", " plt.savefig(path, format='png', dpi=300)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Projection methods\n", "Build 3D dataset:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "rnd.seed(4)\n", "m = 60\n", "w1, w2 = 0.1, 0.3\n", "noise = 0.1\n", "\n", "angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5\n", "X = np.empty((m, 3))\n", "X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2\n", "X[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2\n", "X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * rnd.randn(m)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "Mean normalize the data:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "X = X - X.mean(axis=0)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "Apply PCA to reduce to 2D." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "\n", "pca = PCA(n_components = 2)\n", "X2D = pca.fit_transform(X)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "Recover the 3D points projected on the plane (PCA 2D subspace)." ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "X2D_inv = pca.inverse_transform(X2D)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "Utility class to draw 3D arrows (copied from http://stackoverflow.com/questions/11140163)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "from matplotlib.patches import FancyArrowPatch\n", "from mpl_toolkits.mplot3d import proj3d\n", "\n", "class Arrow3D(FancyArrowPatch):\n", " def __init__(self, xs, ys, zs, *args, **kwargs):\n", " FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)\n", " self._verts3d = xs, ys, zs\n", "\n", " def draw(self, renderer):\n", " xs3d, ys3d, zs3d = self._verts3d\n", " xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)\n", " self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))\n", " FancyArrowPatch.draw(self, renderer)" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "Express the plane as a function of x and y." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]\n", "\n", "x1s = np.linspace(axes[0], axes[1], 10)\n", "x2s = np.linspace(axes[2], axes[3], 10)\n", "x1, x2 = np.meshgrid(x1s, x2s)\n", "\n", "C = pca.components_\n", "R = C.T.dot(C)\n", "z = (R[0, 2] * x1 + R[1, 2] * x2) / (1 - R[2, 2])" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "Plot the 3D dataset, the plane and the projections on that plane." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from mpl_toolkits.mplot3d import Axes3D\n", "\n", "fig = plt.figure(figsize=(6, 3.8))\n", "ax = fig.add_subplot(111, projection='3d')\n", "\n", "X3D_above = X[X[:, 2] > X2D_inv[:, 2]]\n", "X3D_below = X[X[:, 2] <= X2D_inv[:, 2]]\n", "\n", "ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], \"bo\", alpha=0.5)\n", "\n", "ax.plot_surface(x1, x2, z, alpha=0.2, color=\"k\")\n", "np.linalg.norm(C, axis=0)\n", "ax.add_artist(Arrow3D([0, C[0, 0]],[0, C[0, 1]],[0, C[0, 2]], mutation_scale=15, lw=1, arrowstyle=\"-|>\", color=\"k\"))\n", "ax.add_artist(Arrow3D([0, C[1, 0]],[0, C[1, 1]],[0, C[1, 2]], mutation_scale=15, lw=1, arrowstyle=\"-|>\", color=\"k\"))\n", "ax.plot([0], [0], [0], \"k.\")\n", "\n", "for i in range(m):\n", " if X[i, 2] > X2D_inv[i, 2]:\n", " ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], \"k-\")\n", " else:\n", " ax.plot([X[i][0], X2D_inv[i][0]], [X[i][1], X2D_inv[i][1]], [X[i][2], X2D_inv[i][2]], \"k-\", color=\"#505050\")\n", " \n", "ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], \"k+\")\n", "ax.plot(X2D_inv[:, 0], X2D_inv[:, 1], X2D_inv[:, 2], \"k.\")\n", "ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], \"bo\")\n", "ax.set_xlabel(\"$x_1$\", fontsize=18)\n", "ax.set_ylabel(\"$x_2$\", fontsize=18)\n", "ax.set_zlabel(\"$x_3$\", fontsize=18)\n", "ax.set_xlim(axes[0:2])\n", "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", "save_fig(\"dataset_3d_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "fig = plt.figure()\n", "ax = fig.add_subplot(111, aspect='equal')\n", "\n", "ax.plot(X2D[:, 0], X2D[:, 1], \"k+\")\n", "ax.plot(X2D[:, 0], X2D[:, 1], \"k.\")\n", "ax.plot([0], [0], \"ko\")\n", "ax.arrow(0, 0, 0, 1, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n", "ax.arrow(0, 0, 1, 0, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n", "ax.set_xlabel(\"$z_1$\", fontsize=18)\n", "ax.set_ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", "ax.axis([-1.5, 1.3, -1.2, 1.2])\n", "ax.grid(True)\n", "save_fig(\"dataset_2d_plot\")" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "PCA using SVD decomposition" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "m, n = X.shape\n", "\n", "X_centered = X - X.mean(axis=0)\n", "U, s, V = np.linalg.svd(X_centered)\n", "c1 = V.T[:, 0]\n", "c2 = V.T[:, 1]\n", "\n", "S = np.zeros(X.shape)\n", "S[:n, :n] = np.diag(s)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.allclose(X, U.dot(S).dot(V))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "T = X.dot(V.T[:, :2])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.allclose(T, U.dot(S)[:, :2])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "pca = PCA(n_components = 2)\n", "X2D_p = pca.fit_transform(X)\n", "np.allclose(X2D_p, T)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "X3D_recover = T.dot(V[:2, :])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.allclose(X3D_recover, pca.inverse_transform(X2D_p))" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "V" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pca.components_" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "R = pca.components_.T.dot(pca.components_)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "S[:3]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pca.explained_variance_ratio_" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "1 - pca.explained_variance_ratio_.sum()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "S[0,0]**2/(S**2).sum(), S[1,1]**2/(S**2).sum()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.sqrt((T[:, 1]**2).sum())" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Manifold learning\n", "Swiss roll:" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.datasets import make_swiss_roll\n", "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "axes = [-11.5, 14, -2, 23, -12, 15]\n", "\n", "fig = plt.figure(figsize=(6, 5))\n", "ax = fig.add_subplot(111, projection='3d')\n", "\n", "ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=plt.cm.hot)\n", "ax.view_init(10, -70)\n", "ax.set_xlabel(\"$x_1$\", fontsize=18)\n", "ax.set_ylabel(\"$x_2$\", fontsize=18)\n", "ax.set_zlabel(\"$x_3$\", fontsize=18)\n", "ax.set_xlim(axes[0:2])\n", "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", "save_fig(\"swiss_roll_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure(figsize=(11, 4))\n", "\n", "plt.subplot(121)\n", "plt.scatter(X[:, 0], X[:, 1], c=t, cmap=plt.cm.hot)\n", "plt.axis(axes[:4])\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$x_2$\", fontsize=18, rotation=0)\n", "plt.grid(True)\n", "\n", "plt.subplot(122)\n", "plt.scatter(t, X[:, 1], c=t, cmap=plt.cm.hot)\n", "plt.axis([4, 15, axes[2], axes[3]])\n", "plt.xlabel(\"$z_1$\", fontsize=18)\n", "plt.grid(True)\n", "\n", "save_fig(\"squished_swiss_roll_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from matplotlib import gridspec\n", "\n", "axes = [-11.5, 14, -2, 23, -12, 15]\n", "\n", "x2s = np.linspace(axes[2], axes[3], 10)\n", "x3s = np.linspace(axes[4], axes[5], 10)\n", "x2, x3 = np.meshgrid(x2s, x3s)\n", "\n", "fig = plt.figure(figsize=(6, 5))\n", "ax = plt.subplot(111, projection='3d')\n", "\n", "positive_class = X[:, 0] > 5\n", "X_pos = X[positive_class]\n", "X_neg = X[~positive_class]\n", "ax.view_init(10, -70)\n", "ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n", "ax.plot_wireframe(5, x2, x3, alpha=0.5)\n", "ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n", "ax.set_xlabel(\"$x_1$\", fontsize=18)\n", "ax.set_ylabel(\"$x_2$\", fontsize=18)\n", "ax.set_zlabel(\"$x_3$\", fontsize=18)\n", "ax.set_xlim(axes[0:2])\n", "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", "save_fig(\"manifold_decision_boundary_plot1\")\n", "plt.show()\n", "\n", "fig = plt.figure(figsize=(5, 4))\n", "ax = plt.subplot(111)\n", "\n", "plt.plot(t[positive_class], X[positive_class, 1], \"gs\")\n", "plt.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n", "plt.axis([4, 15, axes[2], axes[3]])\n", "plt.xlabel(\"$z_1$\", fontsize=18)\n", "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", "plt.grid(True)\n", "\n", "save_fig(\"manifold_decision_boundary_plot2\")\n", "plt.show()\n", "\n", "fig = plt.figure(figsize=(6, 5))\n", "ax = plt.subplot(111, projection='3d')\n", "\n", "positive_class = 2 * (t[:] - 4) > X[:, 1]\n", "X_pos = X[positive_class]\n", "X_neg = X[~positive_class]\n", "ax.view_init(10, -70)\n", "ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n", "ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n", "ax.set_xlabel(\"$x_1$\", fontsize=18)\n", "ax.set_ylabel(\"$x_2$\", fontsize=18)\n", "ax.set_zlabel(\"$x_3$\", fontsize=18)\n", "ax.set_xlim(axes[0:2])\n", "ax.set_ylim(axes[2:4])\n", "ax.set_zlim(axes[4:6])\n", "\n", "save_fig(\"manifold_decision_boundary_plot3\")\n", "plt.show()\n", "\n", "fig = plt.figure(figsize=(5, 4))\n", "ax = plt.subplot(111)\n", "\n", "plt.plot(t[positive_class], X[positive_class, 1], \"gs\")\n", "plt.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n", "plt.plot([4, 15], [0, 22], \"b-\", linewidth=2)\n", "plt.axis([4, 15, axes[2], axes[3]])\n", "plt.xlabel(\"$z_1$\", fontsize=18)\n", "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", "plt.grid(True)\n", "\n", "save_fig(\"manifold_decision_boundary_plot4\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# PCA" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "angle = np.pi / 5\n", "stretch = 5\n", "m = 200\n", "\n", "rnd.seed(3)\n", "X = rnd.randn(m, 2) / 10\n", "X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch\n", "X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate\n", "\n", "u1 = np.array([np.cos(angle), np.sin(angle)])\n", "u2 = np.array([np.cos(angle - 2 * np.pi/6), np.sin(angle - 2 * np.pi/6)])\n", "u3 = np.array([np.cos(angle - np.pi/2), np.sin(angle - np.pi/2)])\n", "\n", "X_proj1 = X.dot(u1.reshape(-1, 1))\n", "X_proj2 = X.dot(u2.reshape(-1, 1))\n", "X_proj3 = X.dot(u3.reshape(-1, 1))\n", "\n", "plt.figure(figsize=(8,4))\n", "plt.subplot2grid((3,2), (0, 0), rowspan=3)\n", "plt.plot([-1.4, 1.4], [-1.4*u1[1]/u1[0], 1.4*u1[1]/u1[0]], \"k-\", linewidth=1)\n", "plt.plot([-1.4, 1.4], [-1.4*u2[1]/u2[0], 1.4*u2[1]/u2[0]], \"k--\", linewidth=1)\n", "plt.plot([-1.4, 1.4], [-1.4*u3[1]/u3[0], 1.4*u3[1]/u3[0]], \"k:\", linewidth=2)\n", "plt.plot(X[:, 0], X[:, 1], \"bo\", alpha=0.5)\n", "plt.axis([-1.4, 1.4, -1.4, 1.4])\n", "plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n", "plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n", "plt.text(u1[0] + 0.1, u1[1] - 0.05, r\"$\\mathbf{c_1}$\", fontsize=22)\n", "plt.text(u3[0] + 0.1, u3[1], r\"$\\mathbf{c_2}$\", fontsize=22)\n", "plt.xlabel(\"$x_1$\", fontsize=18)\n", "plt.ylabel(\"$x_2$\", fontsize=18, rotation=0)\n", "plt.grid(True)\n", "\n", "plt.subplot2grid((3,2), (0, 1))\n", "plt.plot([-2, 2], [0, 0], \"k-\", linewidth=1)\n", "plt.plot(X_proj1[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n", "plt.gca().get_yaxis().set_ticks([])\n", "plt.gca().get_xaxis().set_ticklabels([])\n", "plt.axis([-2, 2, -1, 1])\n", "plt.grid(True)\n", "\n", "plt.subplot2grid((3,2), (1, 1))\n", "plt.plot([-2, 2], [0, 0], \"k--\", linewidth=1)\n", "plt.plot(X_proj2[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n", "plt.gca().get_yaxis().set_ticks([])\n", "plt.gca().get_xaxis().set_ticklabels([])\n", "plt.axis([-2, 2, -1, 1])\n", "plt.grid(True)\n", "\n", "plt.subplot2grid((3,2), (2, 1))\n", "plt.plot([-2, 2], [0, 0], \"k:\", linewidth=2)\n", "plt.plot(X_proj3[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n", "plt.gca().get_yaxis().set_ticks([])\n", "plt.axis([-2, 2, -1, 1])\n", "plt.xlabel(\"$z_1$\", fontsize=18)\n", "plt.grid(True)\n", "\n", "save_fig(\"pca_best_projection\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# MNIST compression" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "from six.moves import urllib\n", "from sklearn.datasets import fetch_mldata\n", "try:\n", " mnist = fetch_mldata('MNIST original')\n", "except urllib.error.HTTPError as ex:\n", " print(\"Could not download MNIST data from mldata.org, trying alternative...\")\n", "\n", " # Alternative method to load MNIST, if mldata.org is down\n", " from scipy.io import loadmat\n", " mnist_alternative_url = \"https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat\"\n", " mnist_path = \"./mnist-original.mat\"\n", " response = urllib.request.urlopen(mnist_alternative_url)\n", " with open(mnist_path, \"wb\") as f:\n", " content = response.read()\n", " f.write(content)\n", " mnist_raw = loadmat(mnist_path)\n", " mnist = {\n", " \"data\": mnist_raw[\"data\"].T,\n", " \"target\": mnist_raw[\"label\"][0],\n", " \"COL_NAMES\": [\"label\", \"data\"],\n", " \"DESCR\": \"mldata.org dataset: mnist-original\",\n", " }\n", " print(\"Success!\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X = mnist[\"data\"]\n", "y = mnist[\"target\"]\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "X = X_train\n", "\n", "pca = PCA()\n", "pca.fit(X)\n", "d = np.argmax(np.cumsum(pca.explained_variance_ratio_) >= 0.95) + 1\n", "d" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "pca = PCA(n_components=0.95)\n", "X_reduced = pca.fit_transform(X)\n", "pca.n_components_" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.sum(pca.explained_variance_ratio_)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "X_mnist = X_train\n", "\n", "pca = PCA(n_components = 154)\n", "X_mnist_reduced = pca.fit_transform(X_mnist)\n", "X_mnist_recovered = pca.inverse_transform(X_mnist_reduced)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "def plot_digits(instances, images_per_row=5, **options):\n", " size = 28\n", " images_per_row = min(len(instances), images_per_row)\n", " images = [instance.reshape(size,size) for instance in instances]\n", " n_rows = (len(instances) - 1) // images_per_row + 1\n", " row_images = []\n", " n_empty = n_rows * images_per_row - len(instances)\n", " images.append(np.zeros((size, size * n_empty)))\n", " for row in range(n_rows):\n", " rimages = images[row * images_per_row : (row + 1) * images_per_row]\n", " row_images.append(np.concatenate(rimages, axis=1))\n", " image = np.concatenate(row_images, axis=0)\n", " plt.imshow(image, cmap = matplotlib.cm.binary, **options)\n", " plt.axis(\"off\")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure(figsize=(7, 4))\n", "plt.subplot(121)\n", "plot_digits(X_mnist[::2100])\n", "plt.title(\"Original\", fontsize=16)\n", "plt.subplot(122)\n", "plot_digits(X_mnist_recovered[::2100])\n", "plt.title(\"Compressed\", fontsize=16)\n", "\n", "save_fig(\"mnist_compression_plot\")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.decomposition import IncrementalPCA\n", "\n", "n_batches = 100\n", "inc_pca = IncrementalPCA(n_components=154)\n", "for X_batch in np.array_split(X_mnist, n_batches):\n", " print(\".\", end=\"\")\n", " inc_pca.partial_fit(X_batch)\n", "\n", "X_mnist_reduced_inc = inc_pca.transform(X_mnist)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "X_mnist_recovered_inc = inc_pca.inverse_transform(X_mnist_reduced_inc)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure(figsize=(7, 4))\n", "plt.subplot(121)\n", "plot_digits(X_mnist[::2100])\n", "plt.subplot(122)\n", "plot_digits(X_mnist_recovered_inc[::2100])\n", "plt.tight_layout()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.allclose(pca.mean_, inc_pca.mean_)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "np.allclose(X_mnist_reduced, X_mnist_reduced_inc)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "filename = \"my_mnist.data\"\n", "\n", "X_mm = np.memmap(filename, dtype='float32', mode='write', shape=X_mnist.shape)\n", "X_mm[:] = X_mnist\n", "del X_mm" ] }, { "cell_type": "code", "execution_count": 43, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "X_mm = np.memmap(filename, dtype='float32', mode='readonly', shape=X_mnist.shape)\n", "\n", "batch_size = len(X_mnist) // n_batches\n", "inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)\n", "inc_pca.fit(X_mm)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "rnd_pca = PCA(n_components=154, random_state=42, svd_solver=\"randomized\")\n", "X_reduced = rnd_pca.fit_transform(X_mnist)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "import time\n", "\n", "for n_components in (2, 10, 154):\n", " print(\"n_components =\", n_components)\n", " regular_pca = PCA(n_components=n_components)\n", " inc_pca = IncrementalPCA(n_components=154, batch_size=500)\n", " rnd_pca = PCA(n_components=154, random_state=42, svd_solver=\"randomized\")\n", "\n", " for pca in (regular_pca, inc_pca, rnd_pca):\n", " t1 = time.time()\n", " pca.fit(X_mnist)\n", " t2 = time.time()\n", " print(pca.__class__.__name__, t2 - t1, \"seconds\")" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# Kernel PCA" ] }, { "cell_type": "code", "execution_count": 46, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.decomposition import KernelPCA\n", "\n", "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)\n", "\n", "lin_pca = KernelPCA(n_components = 2, kernel=\"linear\", fit_inverse_transform=True)\n", "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433, fit_inverse_transform=True)\n", "sig_pca = KernelPCA(n_components = 2, kernel=\"sigmoid\", gamma=0.001, coef0=1, fit_inverse_transform=True)\n", "\n", "y = t > 6.9\n", "\n", "plt.figure(figsize=(11, 4))\n", "for subplot, pca, title in ((131, lin_pca, \"Linear kernel\"), (132, rbf_pca, \"RBF kernel, $\\gamma=0.04$\"), (133, sig_pca, \"Sigmoid kernel, $\\gamma=10^{-3}, r=1$\")):\n", " X_reduced = pca.fit_transform(X)\n", " if subplot == 132:\n", " X_reduced_rbf = X_reduced\n", " \n", " plt.subplot(subplot)\n", " #plt.plot(X_reduced[y, 0], X_reduced[y, 1], \"gs\")\n", " #plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], \"y^\")\n", " plt.title(title, fontsize=14)\n", " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n", " plt.xlabel(\"$z_1$\", fontsize=18)\n", " if subplot == 131:\n", " plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", " plt.grid(True)\n", "\n", "save_fig(\"kernel_pca_plot\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "plt.figure(figsize=(6, 5))\n", "\n", "X_inverse = pca.inverse_transform(X_reduced_rbf)\n", "\n", "ax = plt.subplot(111, projection='3d')\n", "ax.view_init(10, -70)\n", "ax.scatter(X_inverse[:, 0], X_inverse[:, 1], X_inverse[:, 2], c=t, cmap=plt.cm.hot, marker=\"x\")\n", "ax.set_xlabel(\"\")\n", "ax.set_ylabel(\"\")\n", "ax.set_zlabel(\"\")\n", "ax.set_xticklabels([])\n", "ax.set_yticklabels([])\n", "ax.set_zticklabels([])\n", "\n", "save_fig(\"preimage_plot\", tight_layout=False)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "X_reduced = rbf_pca.fit_transform(X)\n", "\n", "plt.figure(figsize=(11, 4))\n", "plt.subplot(132)\n", "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot, marker=\"x\")\n", "plt.xlabel(\"$z_1$\", fontsize=18)\n", "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", "plt.grid(True)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import GridSearchCV\n", "\n", "clf = Pipeline([\n", " (\"kpca\", KernelPCA(n_components=2)),\n", " (\"log_reg\", LogisticRegression())\n", " ])\n", "\n", "param_grid = [\n", " {\"kpca__gamma\": np.linspace(0.03, 0.05, 10), \"kpca__kernel\": [\"rbf\", \"sigmoid\"]}\n", " ]\n", "\n", "grid_search = GridSearchCV(clf, param_grid, cv=3)\n", "grid_search.fit(X, y)\n", "grid_search.best_params_" ] }, { "cell_type": "code", "execution_count": 50, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433,\n", " fit_inverse_transform=True)\n", "X_reduced = rbf_pca.fit_transform(X)\n", "X_preimage = rbf_pca.inverse_transform(X_reduced)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "mean_squared_error(X, X_preimage)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "times_rpca = []\n", "times_pca = []\n", "sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n", "for n_samples in sizes:\n", " X = rnd.randn(n_samples, 5)\n", " pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", " times_rpca.append(t2 - t1)\n", " pca = PCA(n_components = 2)\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", " times_pca.append(t2 - t1)\n", "\n", "plt.plot(sizes, times_rpca, \"b-o\", label=\"RPCA\")\n", "plt.plot(sizes, times_pca, \"r-s\", label=\"PCA\")\n", "plt.xlabel(\"n_samples\")\n", "plt.ylabel(\"Training time\")\n", "plt.legend(loc=\"upper left\")\n", "plt.title(\"PCA and Randomized PCA time complexity \")" ] }, { "cell_type": "code", "execution_count": 53, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "times_rpca = []\n", "times_pca = []\n", "sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n", "for n_features in sizes:\n", " X = rnd.randn(2000, n_features)\n", " pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", " times_rpca.append(t2 - t1)\n", " pca = PCA(n_components = 2)\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", " times_pca.append(t2 - t1)\n", "\n", "plt.plot(sizes, times_rpca, \"b-o\", label=\"RPCA\")\n", "plt.plot(sizes, times_pca, \"r-s\", label=\"PCA\")\n", "plt.xlabel(\"n_features\")\n", "plt.ylabel(\"Training time\")\n", "plt.legend(loc=\"upper left\")\n", "plt.title(\"PCA and Randomized PCA time complexity \")" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# LLE" ] }, { "cell_type": "code", "execution_count": 54, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.manifold import LocallyLinearEmbedding\n", "\n", "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)\n", "\n", "lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2, random_state=42)\n", "X_reduced = lle.fit_transform(X)\n", "\n", "plt.title(\"Unrolled swiss roll using LLE\", fontsize=14)\n", "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n", "plt.xlabel(\"$z_1$\", fontsize=18)\n", "plt.ylabel(\"$z_2$\", fontsize=18)\n", "plt.axis([-0.065, 0.055, -0.1, 0.12])\n", "plt.grid(True)\n", "\n", "save_fig(\"lle_unrolling_plot\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "# MDS, Isomap and t-SNE" ] }, { "cell_type": "code", "execution_count": 55, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.manifold import MDS\n", "mds = MDS(n_components=2, random_state=42)\n", "X_reduced_mds = mds.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.manifold import Isomap\n", "isomap = Isomap(n_components=2)\n", "X_reduced_isomap = isomap.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.manifold import TSNE\n", "tsne = TSNE(n_components=2)\n", "X_reduced_tsne = tsne.fit_transform(X)" ] }, { "cell_type": "code", "execution_count": 58, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n", "lda = LinearDiscriminantAnalysis(n_components=2)\n", "X_mnist = mnist[\"data\"]\n", "y_mnist = mnist[\"target\"]\n", "lda.fit(X_mnist, y_mnist)\n", "X_reduced_lda = lda.transform(X_mnist)" ] }, { "cell_type": "code", "execution_count": 59, "metadata": { "collapsed": false, "deletable": true, "editable": true }, "outputs": [], "source": [ "titles = [\"MDS\", \"Isomap\", \"t-SNE\"]\n", "\n", "plt.figure(figsize=(11,4))\n", "\n", "for subplot, title, X_reduced in zip((131, 132, 133), titles,\n", " (X_reduced_mds, X_reduced_isomap, X_reduced_tsne)):\n", " plt.subplot(subplot)\n", " plt.title(title, fontsize=14)\n", " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n", " plt.xlabel(\"$z_1$\", fontsize=18)\n", " if subplot == 131:\n", " plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n", " plt.grid(True)\n", "\n", "save_fig(\"other_dim_reduction_plot\")\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true, "deletable": true, "editable": true }, "source": [ "# Exercise solutions" ] }, { "cell_type": "markdown", "metadata": { "deletable": true, "editable": true }, "source": [ "**Coming soon**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true, "deletable": true, "editable": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.5.2+" }, "nav_menu": { "height": "352px", "width": "458px" }, "toc": { "navigate_menu": true, "number_sections": true, "sideBar": true, "threshold": 6, "toc_cell": false, "toc_section_display": "block", "toc_window_display": false } }, "nbformat": 4, "nbformat_minor": 0 }