handson-ml/08_dimensionality_reduction...

1911 lines
46 KiB
Plaintext
Raw Normal View History

2016-09-27 23:31:21 +02:00
{
"cells": [
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"**Chapter 8 Dimensionality Reduction**"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"_This notebook contains all the sample code and solutions to the exercices in chapter 8._"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
"from __future__ import division, print_function, unicode_literals\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import os\n",
"\n",
"# to make this notebook's output stable across runs\n",
"np.random.seed(42)\n",
2016-09-27 23:31:21 +02:00
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams['axes.labelsize'] = 14\n",
"plt.rcParams['xtick.labelsize'] = 12\n",
"plt.rcParams['ytick.labelsize'] = 12\n",
"\n",
"# Where to save the figures\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"dim_reduction\"\n",
"\n",
"def save_fig(fig_id, tight_layout=True):\n",
" path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
" print(\"Saving figure\", fig_id)\n",
" if tight_layout:\n",
" plt.tight_layout()\n",
" plt.savefig(path, format='png', dpi=300)"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"# Projection methods\n",
"Build 3D dataset:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.random.seed(4)\n",
2016-09-27 23:31:21 +02:00
"m = 60\n",
"w1, w2 = 0.1, 0.3\n",
"noise = 0.1\n",
"\n",
"angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5\n",
2016-09-27 23:31:21 +02:00
"X = np.empty((m, 3))\n",
"X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2\n",
"X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n",
"X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"## PCA using SVD decomposition"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true,
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X_centered = X - X.mean(axis=0)\n",
"U, s, V = np.linalg.svd(X_centered)\n",
"c1 = V.T[:, 0]\n",
"c2 = V.T[:, 1]"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 4,
2017-02-17 11:51:26 +01:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2017-02-17 11:51:26 +01:00
},
"outputs": [],
2016-09-27 23:31:21 +02:00
"source": [
"m, n = X.shape\n",
"\n",
"S = np.zeros(X_centered.shape)\n",
"S[:n, :n] = np.diag(s)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 5,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.allclose(X_centered, U.dot(S).dot(V))"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 6,
2017-02-17 11:51:26 +01:00
"metadata": {
"collapsed": false,
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
},
"outputs": [],
2016-09-27 23:31:21 +02:00
"source": [
"W2 = V.T[:, :2]\n",
"X2D = X_centered.dot(W2)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 7,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X2D_using_svd = X2D"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## PCA using Scikit-Learn"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"With Scikit-Learn, PCA is really trivial. It even takes care of mean centering for you:"
]
},
{
"cell_type": "code",
"execution_count": 8,
2017-02-17 11:51:26 +01:00
"metadata": {
"collapsed": false,
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
},
"outputs": [],
2016-09-27 23:31:21 +02:00
"source": [
"from sklearn.decomposition import PCA\n",
"\n",
"pca = PCA(n_components = 2)\n",
"X2D = pca.fit_transform(X)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 9,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X2D[:5]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"X2D_using_svd[:5]"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"collapsed": true,
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"Notice that running PCA multiple times on slightly different datasets may result in different results. In general the only difference is that some axes may be flipped. In this example, PCA using Scikit-Learn gives the same projection as the one given by the SVD approach, except both axes are flipped:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 11,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.allclose(X2D, -X2D_using_svd)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"Recover the 3D points projected on the plane (PCA 2D subspace)."
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 12,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": true,
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X3D_inv = pca.inverse_transform(X2D)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Of course, there was some loss of information during the projection step, so the recovered 3D points are not exactly equal to the original 3D points:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 13,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.allclose(X3D_inv, X)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"We can compute the reconstruction error:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 14,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.mean(np.sum(np.square(X3D_inv - X), axis=1))"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"The inverse transform in the SVD approach looks like this:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 15,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X3D_inv_using_svd = X2D_using_svd.dot(V[:2, :])"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"The reconstructions from both methods are not identical because Scikit-Learn's `PCA` class automatically takes care of reversing the mean centering, but if we subtract the mean, we get the same reconstruction:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 16,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.allclose(X3D_inv_using_svd, X3D_inv - pca.mean_)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"The `PCA` object gives access to the principal components that it computed:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 17,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"pca.components_"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Compare to the first two principal components computed using the SVD method:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 18,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"V[:2]"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Notice how the axes are flipped."
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Now let's look at the explained variance ratio:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 19,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"pca.explained_variance_ratio_"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"The first dimension explains 84.2% of the variance, while the second explains 14.6%."
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"By projecting down to 2D, we lost about 1.1% of the variance:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 20,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"1 - pca.explained_variance_ratio_.sum()"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Here is how to compute the explained variance ratio using the SVD approach (recall that `s` is the diagonal of the matrix `S`):"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 21,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.square(s) / np.square(s).sum()"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Next, let's generate some nice figures! :)"
]
},
{
"cell_type": "markdown",
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"source": [
"Utility class to draw 3D arrows (copied from http://stackoverflow.com/questions/11140163)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 22,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from matplotlib.patches import FancyArrowPatch\n",
"from mpl_toolkits.mplot3d import proj3d\n",
"\n",
"class Arrow3D(FancyArrowPatch):\n",
" def __init__(self, xs, ys, zs, *args, **kwargs):\n",
" FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)\n",
" self._verts3d = xs, ys, zs\n",
"\n",
" def draw(self, renderer):\n",
" xs3d, ys3d, zs3d = self._verts3d\n",
" xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)\n",
" self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))\n",
" FancyArrowPatch.draw(self, renderer)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"source": [
"Express the plane as a function of x and y."
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 23,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]\n",
"\n",
"x1s = np.linspace(axes[0], axes[1], 10)\n",
"x2s = np.linspace(axes[2], axes[3], 10)\n",
"x1, x2 = np.meshgrid(x1s, x2s)\n",
"\n",
"C = pca.components_\n",
"R = C.T.dot(C)\n",
"z = (R[0, 2] * x1 + R[1, 2] * x2) / (1 - R[2, 2])"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"source": [
"Plot the 3D dataset, the plane and the projections on that plane."
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 24,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from mpl_toolkits.mplot3d import Axes3D\n",
"\n",
"fig = plt.figure(figsize=(6, 3.8))\n",
"ax = fig.add_subplot(111, projection='3d')\n",
"\n",
"X3D_above = X[X[:, 2] > X3D_inv[:, 2]]\n",
"X3D_below = X[X[:, 2] <= X3D_inv[:, 2]]\n",
"\n",
"ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], \"bo\", alpha=0.5)\n",
"\n",
"ax.plot_surface(x1, x2, z, alpha=0.2, color=\"k\")\n",
"np.linalg.norm(C, axis=0)\n",
"ax.add_artist(Arrow3D([0, C[0, 0]],[0, C[0, 1]],[0, C[0, 2]], mutation_scale=15, lw=1, arrowstyle=\"-|>\", color=\"k\"))\n",
"ax.add_artist(Arrow3D([0, C[1, 0]],[0, C[1, 1]],[0, C[1, 2]], mutation_scale=15, lw=1, arrowstyle=\"-|>\", color=\"k\"))\n",
"ax.plot([0], [0], [0], \"k.\")\n",
"\n",
"for i in range(m):\n",
" if X[i, 2] > X3D_inv[i, 2]:\n",
" ax.plot([X[i][0], X3D_inv[i][0]], [X[i][1], X3D_inv[i][1]], [X[i][2], X3D_inv[i][2]], \"k-\")\n",
" else:\n",
" ax.plot([X[i][0], X3D_inv[i][0]], [X[i][1], X3D_inv[i][1]], [X[i][2], X3D_inv[i][2]], \"k-\", color=\"#505050\")\n",
" \n",
"ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k+\")\n",
"ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k.\")\n",
"ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], \"bo\")\n",
"ax.set_xlabel(\"$x_1$\", fontsize=18)\n",
"ax.set_ylabel(\"$x_2$\", fontsize=18)\n",
"ax.set_zlabel(\"$x_3$\", fontsize=18)\n",
"ax.set_xlim(axes[0:2])\n",
"ax.set_ylim(axes[2:4])\n",
"ax.set_zlim(axes[4:6])\n",
"\n",
"save_fig(\"dataset_3d_plot\")\n",
"plt.show()"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 25,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"fig = plt.figure()\n",
"ax = fig.add_subplot(111, aspect='equal')\n",
"\n",
"ax.plot(X2D[:, 0], X2D[:, 1], \"k+\")\n",
"ax.plot(X2D[:, 0], X2D[:, 1], \"k.\")\n",
"ax.plot([0], [0], \"ko\")\n",
"ax.arrow(0, 0, 0, 1, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
"ax.arrow(0, 0, 1, 0, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
"ax.set_xlabel(\"$z_1$\", fontsize=18)\n",
"ax.set_ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
"ax.axis([-1.5, 1.3, -1.2, 1.2])\n",
"ax.grid(True)\n",
"save_fig(\"dataset_2d_plot\")"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"# Manifold learning\n",
"Swiss roll:"
]
},
{
"cell_type": "code",
"execution_count": 26,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.datasets import make_swiss_roll\n",
"X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 27,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"axes = [-11.5, 14, -2, 23, -12, 15]\n",
"\n",
"fig = plt.figure(figsize=(6, 5))\n",
"ax = fig.add_subplot(111, projection='3d')\n",
"\n",
"ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=plt.cm.hot)\n",
"ax.view_init(10, -70)\n",
"ax.set_xlabel(\"$x_1$\", fontsize=18)\n",
"ax.set_ylabel(\"$x_2$\", fontsize=18)\n",
"ax.set_zlabel(\"$x_3$\", fontsize=18)\n",
"ax.set_xlim(axes[0:2])\n",
"ax.set_ylim(axes[2:4])\n",
"ax.set_zlim(axes[4:6])\n",
"\n",
"save_fig(\"swiss_roll_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 28,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"plt.figure(figsize=(11, 4))\n",
"\n",
"plt.subplot(121)\n",
"plt.scatter(X[:, 0], X[:, 1], c=t, cmap=plt.cm.hot)\n",
"plt.axis(axes[:4])\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$x_2$\", fontsize=18, rotation=0)\n",
"plt.grid(True)\n",
"\n",
"plt.subplot(122)\n",
"plt.scatter(t, X[:, 1], c=t, cmap=plt.cm.hot)\n",
"plt.axis([4, 15, axes[2], axes[3]])\n",
"plt.xlabel(\"$z_1$\", fontsize=18)\n",
"plt.grid(True)\n",
"\n",
"save_fig(\"squished_swiss_roll_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 29,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from matplotlib import gridspec\n",
"\n",
"axes = [-11.5, 14, -2, 23, -12, 15]\n",
"\n",
"x2s = np.linspace(axes[2], axes[3], 10)\n",
"x3s = np.linspace(axes[4], axes[5], 10)\n",
"x2, x3 = np.meshgrid(x2s, x3s)\n",
"\n",
"fig = plt.figure(figsize=(6, 5))\n",
"ax = plt.subplot(111, projection='3d')\n",
"\n",
"positive_class = X[:, 0] > 5\n",
"X_pos = X[positive_class]\n",
"X_neg = X[~positive_class]\n",
"ax.view_init(10, -70)\n",
"ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n",
"ax.plot_wireframe(5, x2, x3, alpha=0.5)\n",
"ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n",
"ax.set_xlabel(\"$x_1$\", fontsize=18)\n",
"ax.set_ylabel(\"$x_2$\", fontsize=18)\n",
"ax.set_zlabel(\"$x_3$\", fontsize=18)\n",
"ax.set_xlim(axes[0:2])\n",
"ax.set_ylim(axes[2:4])\n",
"ax.set_zlim(axes[4:6])\n",
"\n",
"save_fig(\"manifold_decision_boundary_plot1\")\n",
"plt.show()\n",
"\n",
"fig = plt.figure(figsize=(5, 4))\n",
"ax = plt.subplot(111)\n",
"\n",
"plt.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
"plt.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
"plt.axis([4, 15, axes[2], axes[3]])\n",
"plt.xlabel(\"$z_1$\", fontsize=18)\n",
"plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
"plt.grid(True)\n",
"\n",
"save_fig(\"manifold_decision_boundary_plot2\")\n",
"plt.show()\n",
"\n",
"fig = plt.figure(figsize=(6, 5))\n",
"ax = plt.subplot(111, projection='3d')\n",
"\n",
"positive_class = 2 * (t[:] - 4) > X[:, 1]\n",
"X_pos = X[positive_class]\n",
"X_neg = X[~positive_class]\n",
"ax.view_init(10, -70)\n",
"ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n",
"ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n",
"ax.set_xlabel(\"$x_1$\", fontsize=18)\n",
"ax.set_ylabel(\"$x_2$\", fontsize=18)\n",
"ax.set_zlabel(\"$x_3$\", fontsize=18)\n",
"ax.set_xlim(axes[0:2])\n",
"ax.set_ylim(axes[2:4])\n",
"ax.set_zlim(axes[4:6])\n",
"\n",
"save_fig(\"manifold_decision_boundary_plot3\")\n",
"plt.show()\n",
"\n",
"fig = plt.figure(figsize=(5, 4))\n",
"ax = plt.subplot(111)\n",
"\n",
"plt.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
"plt.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
"plt.plot([4, 15], [0, 22], \"b-\", linewidth=2)\n",
"plt.axis([4, 15, axes[2], axes[3]])\n",
"plt.xlabel(\"$z_1$\", fontsize=18)\n",
"plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
"plt.grid(True)\n",
"\n",
"save_fig(\"manifold_decision_boundary_plot4\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"# PCA"
]
},
{
"cell_type": "code",
"execution_count": 30,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"angle = np.pi / 5\n",
"stretch = 5\n",
"m = 200\n",
"\n",
"np.random.seed(3)\n",
"X = np.random.randn(m, 2) / 10\n",
2016-09-27 23:31:21 +02:00
"X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch\n",
"X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate\n",
"\n",
"u1 = np.array([np.cos(angle), np.sin(angle)])\n",
"u2 = np.array([np.cos(angle - 2 * np.pi/6), np.sin(angle - 2 * np.pi/6)])\n",
"u3 = np.array([np.cos(angle - np.pi/2), np.sin(angle - np.pi/2)])\n",
"\n",
"X_proj1 = X.dot(u1.reshape(-1, 1))\n",
"X_proj2 = X.dot(u2.reshape(-1, 1))\n",
"X_proj3 = X.dot(u3.reshape(-1, 1))\n",
"\n",
"plt.figure(figsize=(8,4))\n",
"plt.subplot2grid((3,2), (0, 0), rowspan=3)\n",
"plt.plot([-1.4, 1.4], [-1.4*u1[1]/u1[0], 1.4*u1[1]/u1[0]], \"k-\", linewidth=1)\n",
"plt.plot([-1.4, 1.4], [-1.4*u2[1]/u2[0], 1.4*u2[1]/u2[0]], \"k--\", linewidth=1)\n",
"plt.plot([-1.4, 1.4], [-1.4*u3[1]/u3[0], 1.4*u3[1]/u3[0]], \"k:\", linewidth=2)\n",
"plt.plot(X[:, 0], X[:, 1], \"bo\", alpha=0.5)\n",
"plt.axis([-1.4, 1.4, -1.4, 1.4])\n",
"plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
"plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
"plt.text(u1[0] + 0.1, u1[1] - 0.05, r\"$\\mathbf{c_1}$\", fontsize=22)\n",
"plt.text(u3[0] + 0.1, u3[1], r\"$\\mathbf{c_2}$\", fontsize=22)\n",
"plt.xlabel(\"$x_1$\", fontsize=18)\n",
"plt.ylabel(\"$x_2$\", fontsize=18, rotation=0)\n",
"plt.grid(True)\n",
"\n",
"plt.subplot2grid((3,2), (0, 1))\n",
"plt.plot([-2, 2], [0, 0], \"k-\", linewidth=1)\n",
"plt.plot(X_proj1[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n",
"plt.gca().get_yaxis().set_ticks([])\n",
"plt.gca().get_xaxis().set_ticklabels([])\n",
"plt.axis([-2, 2, -1, 1])\n",
"plt.grid(True)\n",
"\n",
"plt.subplot2grid((3,2), (1, 1))\n",
"plt.plot([-2, 2], [0, 0], \"k--\", linewidth=1)\n",
"plt.plot(X_proj2[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n",
"plt.gca().get_yaxis().set_ticks([])\n",
"plt.gca().get_xaxis().set_ticklabels([])\n",
"plt.axis([-2, 2, -1, 1])\n",
"plt.grid(True)\n",
"\n",
"plt.subplot2grid((3,2), (2, 1))\n",
"plt.plot([-2, 2], [0, 0], \"k:\", linewidth=2)\n",
"plt.plot(X_proj3[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n",
"plt.gca().get_yaxis().set_ticks([])\n",
"plt.axis([-2, 2, -1, 1])\n",
"plt.xlabel(\"$z_1$\", fontsize=18)\n",
"plt.grid(True)\n",
"\n",
"save_fig(\"pca_best_projection\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"# MNIST compression"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"from six.moves import urllib\n",
"from sklearn.datasets import fetch_mldata\n",
"mnist = fetch_mldata('MNIST original')"
]
},
2016-09-27 23:31:21 +02:00
{
"cell_type": "code",
"execution_count": 32,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
2016-11-05 14:26:57 +01:00
"from sklearn.model_selection import train_test_split\n",
2016-09-27 23:31:21 +02:00
"\n",
"X = mnist[\"data\"]\n",
"y = mnist[\"target\"]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y)"
]
},
{
"cell_type": "code",
"execution_count": 33,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"pca = PCA()\n",
"pca.fit(X_train)\n",
"cumsum = np.cumsum(pca.explained_variance_ratio_)\n",
"d = np.argmax(cumsum >= 0.95) + 1"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
2016-09-27 23:31:21 +02:00
"d"
]
},
{
"cell_type": "code",
"execution_count": 35,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"pca = PCA(n_components=0.95)\n",
"X_reduced = pca.fit_transform(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
2016-09-27 23:31:21 +02:00
"pca.n_components_"
]
},
{
"cell_type": "code",
"execution_count": 37,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"np.sum(pca.explained_variance_ratio_)"
]
},
{
"cell_type": "code",
"execution_count": 38,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"pca = PCA(n_components = 154)\n",
"X_reduced = pca.fit_transform(X_train)\n",
"X_recovered = pca.inverse_transform(X_reduced)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 39,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"def plot_digits(instances, images_per_row=5, **options):\n",
" size = 28\n",
" images_per_row = min(len(instances), images_per_row)\n",
" images = [instance.reshape(size,size) for instance in instances]\n",
" n_rows = (len(instances) - 1) // images_per_row + 1\n",
" row_images = []\n",
" n_empty = n_rows * images_per_row - len(instances)\n",
" images.append(np.zeros((size, size * n_empty)))\n",
" for row in range(n_rows):\n",
" rimages = images[row * images_per_row : (row + 1) * images_per_row]\n",
" row_images.append(np.concatenate(rimages, axis=1))\n",
" image = np.concatenate(row_images, axis=0)\n",
" plt.imshow(image, cmap = matplotlib.cm.binary, **options)\n",
" plt.axis(\"off\")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"plt.figure(figsize=(7, 4))\n",
"plt.subplot(121)\n",
"plot_digits(X_train[::2100])\n",
"plt.title(\"Original\", fontsize=16)\n",
"plt.subplot(122)\n",
"plot_digits(X_recovered[::2100])\n",
"plt.title(\"Compressed\", fontsize=16)\n",
"\n",
"save_fig(\"mnist_compression_plot\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"X_reduced_pca = X_reduced"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Incremental PCA"
]
},
{
"cell_type": "code",
"execution_count": 42,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.decomposition import IncrementalPCA\n",
"\n",
"n_batches = 100\n",
"inc_pca = IncrementalPCA(n_components=154)\n",
"for X_batch in np.array_split(X_train, n_batches):\n",
" print(\".\", end=\"\") # not shown in the book\n",
" inc_pca.partial_fit(X_batch)\n",
"\n",
"X_reduced = inc_pca.transform(X_train)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"plt.figure(figsize=(7, 4))\n",
"plt.subplot(121)\n",
"plot_digits(X_train[::2100])\n",
"plt.subplot(122)\n",
"plot_digits(X_recovered_inc_pca[::2100])\n",
"plt.tight_layout()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"X_reduced_inc_pca = X_reduced"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Let's compare the results of transforming MNIST using regular PCA and incremental PCA. First, the means are equal: "
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"np.allclose(pca.mean_, inc_pca.mean_)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"But the results are not exactly identical. Incremental PCA gives a very good approximate solution, but it's not perfect:"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"np.allclose(X_reduced_pca, X_reduced_inc_pca)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"### Using `memmap()`"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Let's create the `memmap()` structure and copy the MNIST data into it. This would typically be done by a first program:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 48,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"filename = \"my_mnist.data\"\n",
"m, n = X_train.shape\n",
2016-09-27 23:31:21 +02:00
"\n",
"X_mm = np.memmap(filename, dtype='float32', mode='write', shape=(m, n))\n",
"X_mm[:] = X_train"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Now deleting the `memmap()` object will trigger its Python finalizer, which ensures that the data is saved to disk."
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 49,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"del X_mm"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Next, another program would load the data and use it for training:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 50,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X_mm = np.memmap(filename, dtype=\"float32\", mode=\"readonly\", shape=(m, n))\n",
"\n",
"batch_size = m // n_batches\n",
"inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)\n",
"inc_pca.fit(X_mm)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 51,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"rnd_pca = PCA(n_components=154, svd_solver=\"randomized\", random_state=42)\n",
"X_reduced = rnd_pca.fit_transform(X_train)"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"## Time complexity"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Let's time regular PCA against Incremental PCA and Randomized PCA, for various number of principal components:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 52,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"import time\n",
"\n",
"for n_components in (2, 10, 154):\n",
" print(\"n_components =\", n_components)\n",
" regular_pca = PCA(n_components=n_components)\n",
" inc_pca = IncrementalPCA(n_components=n_components, batch_size=500)\n",
" rnd_pca = PCA(n_components=n_components, random_state=42, svd_solver=\"randomized\")\n",
"\n",
" for pca in (regular_pca, inc_pca, rnd_pca):\n",
" t1 = time.time()\n",
" pca.fit(X_train)\n",
" t2 = time.time()\n",
" print(\" {}: {:.1f} seconds\".format(pca.__class__.__name__, t2 - t1))"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"Now let's compare PCA and Randomized PCA for datasets of different sizes (number of instances):"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 53,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"times_rpca = []\n",
"times_pca = []\n",
"sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n",
"for n_samples in sizes:\n",
" X = np.random.randn(n_samples, 5)\n",
" pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n",
" t1 = time.time()\n",
" pca.fit(X)\n",
" t2 = time.time()\n",
" times_rpca.append(t2 - t1)\n",
" pca = PCA(n_components = 2)\n",
" t1 = time.time()\n",
" pca.fit(X)\n",
" t2 = time.time()\n",
" times_pca.append(t2 - t1)\n",
2016-09-27 23:31:21 +02:00
"\n",
"plt.plot(sizes, times_rpca, \"b-o\", label=\"RPCA\")\n",
"plt.plot(sizes, times_pca, \"r-s\", label=\"PCA\")\n",
"plt.xlabel(\"n_samples\")\n",
"plt.ylabel(\"Training time\")\n",
"plt.legend(loc=\"upper left\")\n",
"plt.title(\"PCA and Randomized PCA time complexity \")"
]
},
{
"cell_type": "markdown",
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"And now let's compare their performance on datasets of 2,000 instances with various numbers of features:"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 54,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true,
"scrolled": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"times_rpca = []\n",
"times_pca = []\n",
"sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n",
"for n_features in sizes:\n",
" X = np.random.randn(2000, n_features)\n",
" pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n",
" t1 = time.time()\n",
" pca.fit(X)\n",
" t2 = time.time()\n",
" times_rpca.append(t2 - t1)\n",
" pca = PCA(n_components = 2)\n",
" t1 = time.time()\n",
" pca.fit(X)\n",
" t2 = time.time()\n",
" times_pca.append(t2 - t1)\n",
2016-09-27 23:31:21 +02:00
"\n",
"plt.plot(sizes, times_rpca, \"b-o\", label=\"RPCA\")\n",
"plt.plot(sizes, times_pca, \"r-s\", label=\"PCA\")\n",
"plt.xlabel(\"n_features\")\n",
"plt.ylabel(\"Training time\")\n",
"plt.legend(loc=\"upper left\")\n",
"plt.title(\"PCA and Randomized PCA time complexity \")"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "markdown",
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"source": [
"# Kernel PCA"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 55,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 56,
2017-02-17 11:51:26 +01:00
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
2017-02-17 11:51:26 +01:00
},
"outputs": [],
2016-09-27 23:31:21 +02:00
"source": [
"from sklearn.decomposition import KernelPCA\n",
"\n",
"rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.04)\n",
"X_reduced = rbf_pca.fit_transform(X)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 57,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.decomposition import KernelPCA\n",
"\n",
"lin_pca = KernelPCA(n_components = 2, kernel=\"linear\", fit_inverse_transform=True)\n",
"rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433, fit_inverse_transform=True)\n",
"sig_pca = KernelPCA(n_components = 2, kernel=\"sigmoid\", gamma=0.001, coef0=1, fit_inverse_transform=True)\n",
"\n",
"y = t > 6.9\n",
"\n",
"plt.figure(figsize=(11, 4))\n",
"for subplot, pca, title in ((131, lin_pca, \"Linear kernel\"), (132, rbf_pca, \"RBF kernel, $\\gamma=0.04$\"), (133, sig_pca, \"Sigmoid kernel, $\\gamma=10^{-3}, r=1$\")):\n",
" X_reduced = pca.fit_transform(X)\n",
" if subplot == 132:\n",
" X_reduced_rbf = X_reduced\n",
" \n",
" plt.subplot(subplot)\n",
" #plt.plot(X_reduced[y, 0], X_reduced[y, 1], \"gs\")\n",
" #plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], \"y^\")\n",
" plt.title(title, fontsize=14)\n",
" plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n",
" plt.xlabel(\"$z_1$\", fontsize=18)\n",
" if subplot == 131:\n",
" plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
" plt.grid(True)\n",
"\n",
"save_fig(\"kernel_pca_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 58,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"plt.figure(figsize=(6, 5))\n",
"\n",
"X_inverse = pca.inverse_transform(X_reduced_rbf)\n",
"\n",
"ax = plt.subplot(111, projection='3d')\n",
"ax.view_init(10, -70)\n",
"ax.scatter(X_inverse[:, 0], X_inverse[:, 1], X_inverse[:, 2], c=t, cmap=plt.cm.hot, marker=\"x\")\n",
"ax.set_xlabel(\"\")\n",
"ax.set_ylabel(\"\")\n",
"ax.set_zlabel(\"\")\n",
"ax.set_xticklabels([])\n",
"ax.set_yticklabels([])\n",
"ax.set_zticklabels([])\n",
"\n",
"save_fig(\"preimage_plot\", tight_layout=False)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 59,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X_reduced = rbf_pca.fit_transform(X)\n",
"\n",
"plt.figure(figsize=(11, 4))\n",
"plt.subplot(132)\n",
"plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot, marker=\"x\")\n",
"plt.xlabel(\"$z_1$\", fontsize=18)\n",
"plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
"plt.grid(True)"
]
},
{
"cell_type": "code",
"execution_count": 60,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
2016-11-05 14:26:57 +01:00
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import Pipeline\n",
2016-09-27 23:31:21 +02:00
"\n",
"clf = Pipeline([\n",
" (\"kpca\", KernelPCA(n_components=2)),\n",
" (\"log_reg\", LogisticRegression())\n",
" ])\n",
"\n",
"param_grid = [{\n",
" \"kpca__gamma\": np.linspace(0.03, 0.05, 10),\n",
" \"kpca__kernel\": [\"rbf\", \"sigmoid\"]\n",
" }]\n",
2016-09-27 23:31:21 +02:00
"\n",
"grid_search = GridSearchCV(clf, param_grid, cv=3)\n",
"grid_search.fit(X, y)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"print(grid_search.best_params_)"
]
},
{
"cell_type": "code",
"execution_count": 62,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433,\n",
" fit_inverse_transform=True)\n",
"X_reduced = rbf_pca.fit_transform(X)\n",
"X_preimage = rbf_pca.inverse_transform(X_reduced)"
]
},
{
"cell_type": "code",
"execution_count": 63,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
2016-09-27 23:31:21 +02:00
"mean_squared_error(X, X_preimage)"
]
},
{
"cell_type": "markdown",
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"source": [
"# LLE"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 64,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 65,
2017-02-17 11:51:26 +01:00
"metadata": {
"collapsed": false,
2017-02-17 11:51:26 +01:00
"deletable": true,
"editable": true
},
"outputs": [],
2016-09-27 23:31:21 +02:00
"source": [
"from sklearn.manifold import LocallyLinearEmbedding\n",
"\n",
"lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)\n",
"X_reduced = lle.fit_transform(X)"
2016-09-27 23:31:21 +02:00
]
},
{
"cell_type": "code",
"execution_count": 66,
2016-09-27 23:31:21 +02:00
"metadata": {
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"plt.title(\"Unrolled swiss roll using LLE\", fontsize=14)\n",
"plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n",
"plt.xlabel(\"$z_1$\", fontsize=18)\n",
"plt.ylabel(\"$z_2$\", fontsize=18)\n",
"plt.axis([-0.065, 0.055, -0.1, 0.12])\n",
"plt.grid(True)\n",
"\n",
"save_fig(\"lle_unrolling_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"# MDS, Isomap and t-SNE"
]
},
{
"cell_type": "code",
"execution_count": 67,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.manifold import MDS\n",
"\n",
2016-09-27 23:31:21 +02:00
"mds = MDS(n_components=2, random_state=42)\n",
"X_reduced_mds = mds.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 68,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.manifold import Isomap\n",
"\n",
2016-09-27 23:31:21 +02:00
"isomap = Isomap(n_components=2)\n",
"X_reduced_isomap = isomap.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 69,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.manifold import TSNE\n",
"\n",
"tsne = TSNE(n_components=2, random_state=42)\n",
2016-09-27 23:31:21 +02:00
"X_reduced_tsne = tsne.fit_transform(X)"
]
},
{
"cell_type": "code",
"execution_count": 70,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
"\n",
2016-09-27 23:31:21 +02:00
"lda = LinearDiscriminantAnalysis(n_components=2)\n",
"X_mnist = mnist[\"data\"]\n",
"y_mnist = mnist[\"target\"]\n",
"lda.fit(X_mnist, y_mnist)\n",
"X_reduced_lda = lda.transform(X_mnist)"
]
},
{
"cell_type": "code",
"execution_count": 71,
2016-09-27 23:31:21 +02:00
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": false,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": [
"titles = [\"MDS\", \"Isomap\", \"t-SNE\"]\n",
"\n",
"plt.figure(figsize=(11,4))\n",
"\n",
"for subplot, title, X_reduced in zip((131, 132, 133), titles,\n",
" (X_reduced_mds, X_reduced_isomap, X_reduced_tsne)):\n",
" plt.subplot(subplot)\n",
" plt.title(title, fontsize=14)\n",
" plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n",
" plt.xlabel(\"$z_1$\", fontsize=18)\n",
" if subplot == 131:\n",
" plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
" plt.grid(True)\n",
"\n",
"save_fig(\"other_dim_reduction_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"source": [
"# Exercise solutions"
]
},
{
"cell_type": "markdown",
2017-02-17 11:51:26 +01:00
"metadata": {
"deletable": true,
"editable": true
},
2016-09-27 23:31:21 +02:00
"source": [
"**Coming soon**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
2017-02-17 11:51:26 +01:00
"collapsed": true,
"deletable": true,
"editable": true
2016-09-27 23:31:21 +02:00
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.3"
2016-09-27 23:31:21 +02:00
},
"nav_menu": {
"height": "352px",
"width": "458px"
},
"toc": {
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 6,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}