diff --git a/07_dimensionality_reduction.ipynb b/07_dimensionality_reduction.ipynb
index d466a34..f80126a 100644
--- a/07_dimensionality_reduction.ipynb
+++ b/07_dimensionality_reduction.ipynb
@@ -30,7 +30,9 @@
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"source": [
"# Setup"
]
@@ -39,7 +41,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
+ "This project requires Python 3.8 or above:"
]
},
{
@@ -48,27 +50,64 @@
"metadata": {},
"outputs": [],
"source": [
- "# Python ≥3.8 is required\n",
"import sys\n",
- "assert sys.version_info >= (3, 8)\n",
"\n",
- "# Common imports\n",
- "import numpy as np\n",
+ "assert sys.version_info >= (3, 8)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It also requires Scikit-Learn ≥ 1.0.1:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sklearn\n",
+ "\n",
+ "assert sklearn.__version__ >= \"1.0.1\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As we did in previous chapters, let's define the default font sizes to make the figures prettier:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib as mpl\n",
+ "\n",
+ "mpl.rc('font', size=12)\n",
+ "mpl.rc('axes', labelsize=14, titlesize=14)\n",
+ "mpl.rc('legend', fontsize=14)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And let's create the `images/dim_reduction` folder (if it doesn't already exist), and define the `save_fig()` function which is used through this notebook to save the figures in high-res for the book:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"from pathlib import Path\n",
"\n",
- "# Scikit-Learn ≥1.0 is required\n",
- "import sklearn\n",
- "assert sklearn.__version__ >= \"1.0\"\n",
- "\n",
- "# To plot pretty figures\n",
- "%matplotlib inline\n",
- "import matplotlib as mpl\n",
- "import matplotlib.pyplot as plt\n",
- "mpl.rc('axes', labelsize=14)\n",
- "mpl.rc('xtick', labelsize=12)\n",
- "mpl.rc('ytick', labelsize=12)\n",
- "\n",
- "# Where to save the figures\n",
"IMAGES_PATH = Path() / \"images\" / \"dim_reduction\"\n",
"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
@@ -83,28 +122,388 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# PCA\n",
- "Let's build a simple 3D dataset:"
+ "# PCA"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's generate a small 3D dataset:"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
- "np.random.seed(4)\n",
- "m = 60\n",
- "w1, w2 = 0.1, 0.3\n",
- "noise = 0.1\n",
+ "# not in the book\n",
"\n",
- "angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5\n",
+ "import numpy as np\n",
+ "\n",
+ "np.random.seed(42)\n",
+ "m = 60\n",
+ "w1, w2 = 0.2, 0.5\n",
+ "noise = 0.2\n",
+ "angles = np.random.rand(m) * 2 * np.pi * 0.8 + np.pi / 2\n",
"X = np.empty((m, 3))\n",
- "X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2\n",
+ "X[:, 0] = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2\n",
"X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n",
"X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This chapter starts with several figures to explain the concepts of PCA and Manifold Learning. Below is the code to generate these figures. You can skip directly to the [Principal Components](#Principal-Components) section below."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Code to generate Figure 8–2. A 3D dataset lying close to a 2D subspace:**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Plot the 3D dataset, with the projection plane."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# not in the book\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from mpl_toolkits.mplot3d import Axes3D\n",
+ "from sklearn.decomposition import PCA\n",
+ "\n",
+ "pca = PCA(n_components=2)\n",
+ "X2D = pca.fit_transform(X) # dataset reduced to 2D\n",
+ "X3D_inv = pca.inverse_transform(X2D) # 3D position of the projected samples\n",
+ "X_centered = X - X.mean(axis=0)\n",
+ "U, s, Vt = np.linalg.svd(X_centered)\n",
+ "\n",
+ "axes = [-1.4, 1.4, -1.4, 1.4, -1.1, 1.1]\n",
+ "x1, x2 = np.meshgrid(np.linspace(axes[0], axes[1], 10),\n",
+ " np.linspace(axes[2], axes[3], 10))\n",
+ "w1, w2 = np.linalg.solve(Vt[:2, :2], Vt[:2, 2]) # projection plane coefs\n",
+ "z = w1 * (x1 - pca.mean_[0]) + w2 * (x2 - pca.mean_[1]) - pca.mean_[2] # plane\n",
+ "X3D_above = X[X[:, 2] >= X3D_inv[:, 2]] # samples above plane\n",
+ "X3D_below = X[X[:, 2] < X3D_inv[:, 2]] # samples below plane\n",
+ "\n",
+ "fig = plt.figure(figsize=(9, 9))\n",
+ "ax = fig.add_subplot(111, projection=\"3d\")\n",
+ "\n",
+ "# plot samples and projection lines below plane first\n",
+ "ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], \"ro\", alpha=0.3)\n",
+ "for i in range(m):\n",
+ " if X[i, 2] < X3D_inv[i, 2]:\n",
+ " ax.plot([X[i][0], X3D_inv[i][0]],\n",
+ " [X[i][1], X3D_inv[i][1]],\n",
+ " [X[i][2], X3D_inv[i][2]], \":\", color=\"#F88\")\n",
+ "\n",
+ "ax.plot_surface(x1, x2, z, alpha=0.1, color=\"b\") # projection plane\n",
+ "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"b+\") # projected samples\n",
+ "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"b.\")\n",
+ "\n",
+ "# now plot projection lines and samples above plane\n",
+ "for i in range(m):\n",
+ " if X[i, 2] >= X3D_inv[i, 2]:\n",
+ " ax.plot([X[i][0], X3D_inv[i][0]],\n",
+ " [X[i][1], X3D_inv[i][1]],\n",
+ " [X[i][2], X3D_inv[i][2]], \"r--\")\n",
+ "\n",
+ "ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], \"ro\")\n",
+ "\n",
+ "def set_xyz_axes(ax, axes):\n",
+ " ax.xaxis.set_rotate_label(False)\n",
+ " ax.yaxis.set_rotate_label(False)\n",
+ " ax.zaxis.set_rotate_label(False)\n",
+ " ax.set_xlabel(\"$x_1$\", labelpad=8, rotation=0)\n",
+ " ax.set_ylabel(\"$x_2$\", labelpad=8, rotation=0)\n",
+ " ax.set_zlabel(\"$x_3$\", labelpad=8, rotation=0)\n",
+ " ax.set_xlim(axes[0:2])\n",
+ " ax.set_ylim(axes[2:4])\n",
+ " ax.set_zlim(axes[4:6])\n",
+ "\n",
+ "set_xyz_axes(ax, axes)\n",
+ "ax.set_zticks([-1, -0.5, 0, 0.5, 1])\n",
+ "\n",
+ "save_fig(\"dataset_3d_plot\", tight_layout=False)\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Code to generate Figure 8–3. The new 2D dataset after projection:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# not in the book\n",
+ "\n",
+ "fig = plt.figure()\n",
+ "ax = fig.add_subplot(1, 1, 1, aspect='equal')\n",
+ "ax.plot(X2D[:, 0], X2D[:, 1], \"b+\")\n",
+ "ax.plot(X2D[:, 0], X2D[:, 1], \"b.\")\n",
+ "ax.plot([0], [0], \"bo\")\n",
+ "ax.arrow(0, 0, 1, 0, head_width=0.05, length_includes_head=True,\n",
+ " head_length=0.1, fc='b', ec='b', linewidth=4)\n",
+ "ax.arrow(0, 0, 0, 1, head_width=0.05, length_includes_head=True,\n",
+ " head_length=0.1, fc='b', ec='b', linewidth=1)\n",
+ "ax.set_xlabel(\"$z_1$\")\n",
+ "ax.set_yticks([-0.5, 0, 0.5, 1])\n",
+ "ax.set_ylabel(\"$z_2$\", rotation=0)\n",
+ "ax.set_axisbelow(True)\n",
+ "ax.grid(True)\n",
+ "save_fig(\"dataset_2d_plot\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Code to generate Figure 8–4. Swiss roll dataset:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.datasets import make_swiss_roll\n",
+ "\n",
+ "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from matplotlib.colors import ListedColormap\n",
+ "\n",
+ "darker_hot = ListedColormap(plt.cm.hot(np.linspace(0, 0.8, 256)))\n",
+ "\n",
+ "axes = [-11.5, 14, -2, 23, -12, 15]\n",
+ "\n",
+ "fig = plt.figure(figsize=(6, 5))\n",
+ "ax = fig.add_subplot(111, projection='3d')\n",
+ "\n",
+ "ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=darker_hot)\n",
+ "ax.view_init(10, -70)\n",
+ "set_xyz_axes(ax, axes)\n",
+ "save_fig(\"swiss_roll_plot\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Code to generate Figure 8–5. Squashing by projecting onto a plane (left) versus unrolling the Swiss roll (right):**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.figure(figsize=(10, 4))\n",
+ "\n",
+ "plt.subplot(121)\n",
+ "plt.scatter(X[:, 0], X[:, 1], c=t, cmap=darker_hot)\n",
+ "plt.axis(axes[:4])\n",
+ "plt.xlabel(\"$x_1$\")\n",
+ "plt.ylabel(\"$x_2$\", labelpad=10, rotation=0)\n",
+ "plt.grid(True)\n",
+ "\n",
+ "plt.subplot(122)\n",
+ "plt.scatter(t, X[:, 1], c=t, cmap=darker_hot)\n",
+ "plt.axis([4, 14.8, axes[2], axes[3]])\n",
+ "plt.xlabel(\"$z_1$\")\n",
+ "plt.grid(True)\n",
+ "\n",
+ "save_fig(\"squished_swiss_roll_plot\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Code to generate Figure 8–6. The decision boundary may not always be simpler with lower dimensions:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "axes = [-11.5, 14, -2, 23, -12, 15]\n",
+ "x2s = np.linspace(axes[2], axes[3], 10)\n",
+ "x3s = np.linspace(axes[4], axes[5], 10)\n",
+ "x2, x3 = np.meshgrid(x2s, x3s)\n",
+ "\n",
+ "positive_class = X[:, 0] > 5\n",
+ "X_pos = X[positive_class]\n",
+ "X_neg = X[~positive_class]\n",
+ "\n",
+ "fig = plt.figure(figsize=(6, 5))\n",
+ "ax = plt.subplot(1, 1, 1, projection='3d')\n",
+ "ax.view_init(10, -70)\n",
+ "ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n",
+ "ax.plot_wireframe(5, x2, x3, alpha=0.5)\n",
+ "ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n",
+ "set_xyz_axes(ax, axes)\n",
+ "save_fig(\"manifold_decision_boundary_plot1\")\n",
+ "plt.show()\n",
+ "\n",
+ "fig = plt.figure(figsize=(5, 4))\n",
+ "ax = plt.subplot(1, 1, 1)\n",
+ "ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
+ "ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
+ "ax.axis([4, 15, axes[2], axes[3]])\n",
+ "ax.set_xlabel(\"$z_1$\")\n",
+ "ax.set_ylabel(\"$z_2$\", rotation=0, labelpad=8)\n",
+ "ax.grid(True)\n",
+ "save_fig(\"manifold_decision_boundary_plot2\")\n",
+ "plt.show()\n",
+ "\n",
+ "positive_class = 2 * (t[:] - 4) > X[:, 1]\n",
+ "X_pos = X[positive_class]\n",
+ "X_neg = X[~positive_class]\n",
+ "\n",
+ "fig = plt.figure(figsize=(6, 5))\n",
+ "ax = plt.subplot(1, 1, 1, projection='3d')\n",
+ "ax.view_init(10, -70)\n",
+ "ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n",
+ "ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n",
+ "ax.xaxis.set_rotate_label(False)\n",
+ "ax.yaxis.set_rotate_label(False)\n",
+ "ax.zaxis.set_rotate_label(False)\n",
+ "ax.set_xlabel(\"$x_1$\", rotation=0)\n",
+ "ax.set_ylabel(\"$x_2$\", rotation=0)\n",
+ "ax.set_zlabel(\"$x_3$\", rotation=0)\n",
+ "ax.set_xlim(axes[0:2])\n",
+ "ax.set_ylim(axes[2:4])\n",
+ "ax.set_zlim(axes[4:6])\n",
+ "save_fig(\"manifold_decision_boundary_plot3\")\n",
+ "plt.show()\n",
+ "\n",
+ "fig = plt.figure(figsize=(5, 4))\n",
+ "ax = plt.subplot(1, 1, 1)\n",
+ "ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
+ "ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
+ "ax.plot([4, 15], [0, 22], \"b-\", linewidth=2)\n",
+ "ax.axis([4, 15, axes[2], axes[3]])\n",
+ "ax.set_xlabel(\"$z_1$\")\n",
+ "ax.set_ylabel(\"$z_2$\", rotation=0, labelpad=8)\n",
+ "ax.grid(True)\n",
+ "save_fig(\"manifold_decision_boundary_plot4\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Code to generate Figure 8–7. Selecting the subspace to project on:**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "angle = np.pi / 5\n",
+ "stretch = 5\n",
+ "m = 200\n",
+ "\n",
+ "np.random.seed(3)\n",
+ "X = np.random.randn(m, 2) / 10\n",
+ "X = X @ np.array([[stretch, 0], [0, 1]]) # stretch\n",
+ "X = X @ [[np.cos(angle), np.sin(angle)],\n",
+ " [np.sin(angle), np.cos(angle)]] # rotate\n",
+ "\n",
+ "u1 = np.array([np.cos(angle), np.sin(angle)])\n",
+ "u2 = np.array([np.cos(angle - 2 * np.pi / 6), np.sin(angle - 2 * np.pi / 6)])\n",
+ "u3 = np.array([np.cos(angle - np.pi / 2), np.sin(angle - np.pi / 2)])\n",
+ "\n",
+ "X_proj1 = X @ u1.reshape(-1, 1)\n",
+ "X_proj2 = X @ u2.reshape(-1, 1)\n",
+ "X_proj3 = X @ u3.reshape(-1, 1)\n",
+ "\n",
+ "plt.figure(figsize=(8, 4))\n",
+ "plt.subplot2grid((3, 2), (0, 0), rowspan=3)\n",
+ "plt.plot([-1.4, 1.4], [-1.4 * u1[1] / u1[0], 1.4 * u1[1] / u1[0]], \"k-\",\n",
+ " linewidth=2)\n",
+ "plt.plot([-1.4, 1.4], [-1.4 * u2[1] / u2[0], 1.4 * u2[1] / u2[0]], \"k--\",\n",
+ " linewidth=2)\n",
+ "plt.plot([-1.4, 1.4], [-1.4 * u3[1] / u3[0], 1.4 * u3[1] / u3[0]], \"k:\",\n",
+ " linewidth=2)\n",
+ "plt.plot(X[:, 0], X[:, 1], \"ro\", alpha=0.5)\n",
+ "plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=4, alpha=0.9,\n",
+ " length_includes_head=True, head_length=0.1, fc=\"b\", ec=\"b\", zorder=10)\n",
+ "plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=1, alpha=0.9,\n",
+ " length_includes_head=True, head_length=0.1, fc=\"b\", ec=\"b\", zorder=10)\n",
+ "plt.text(u1[0] + 0.1, u1[1] - 0.05, r\"$\\mathbf{c_1}$\",\n",
+ " color=\"blue\", fontsize=14)\n",
+ "plt.text(u3[0] + 0.1, u3[1], r\"$\\mathbf{c_2}$\",\n",
+ " color=\"blue\", fontsize=14)\n",
+ "plt.xlabel(\"$x_1$\")\n",
+ "plt.ylabel(\"$x_2$\", rotation=0)\n",
+ "plt.axis([-1.4, 1.4, -1.4, 1.4])\n",
+ "plt.grid()\n",
+ "\n",
+ "plt.subplot2grid((3, 2), (0, 1))\n",
+ "plt.plot([-2, 2], [0, 0], \"k-\", linewidth=2)\n",
+ "plt.plot(X_proj1[:, 0], np.zeros(m), \"ro\", alpha=0.3)\n",
+ "plt.gca().get_yaxis().set_ticks([])\n",
+ "plt.gca().get_xaxis().set_ticklabels([])\n",
+ "plt.axis([-2, 2, -1, 1])\n",
+ "plt.grid()\n",
+ "\n",
+ "plt.subplot2grid((3, 2), (1, 1))\n",
+ "plt.plot([-2, 2], [0, 0], \"k--\", linewidth=2)\n",
+ "plt.plot(X_proj2[:, 0], np.zeros(m), \"ro\", alpha=0.3)\n",
+ "plt.gca().get_yaxis().set_ticks([])\n",
+ "plt.gca().get_xaxis().set_ticklabels([])\n",
+ "plt.axis([-2, 2, -1, 1])\n",
+ "plt.grid()\n",
+ "\n",
+ "plt.subplot2grid((3, 2), (2, 1))\n",
+ "plt.plot([-2, 2], [0, 0], \"k:\", linewidth=2)\n",
+ "plt.plot(X_proj3[:, 0], np.zeros(m), \"ro\", alpha=0.3)\n",
+ "plt.gca().get_yaxis().set_ticks([])\n",
+ "plt.axis([-2, 2, -1, 1])\n",
+ "plt.xlabel(\"$z_1$\")\n",
+ "plt.grid()\n",
+ "\n",
+ "save_fig(\"pca_best_projection_plot\")\n",
+ "plt.show()"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -114,35 +513,37 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
+ "import numpy as np\n",
+ "\n",
+ "# X = [...] # the small 3D dataset was created ealier in this notebook\n",
"X_centered = X - X.mean(axis=0)\n",
"U, s, Vt = np.linalg.svd(X_centered)\n",
- "c1 = Vt.T[:, 0]\n",
- "c2 = Vt.T[:, 1]"
+ "c1 = Vt[0]\n",
+ "c2 = Vt[1]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note: in principle, the SVD factorization algorithm returns three matrices, **U**, **Σ** and **V**, such that **X** = **UΣV**⊺, where **U** is an _m_ × _m_ matrix, **Σ** is an _m_ × _n_ matrix, and **V** is an _n_ × _n_ matrix. But the `svd()` function returns **U**, **s** and **V**⊺ instead. **s** is the vector containing all the values on the main diagonal of the top _n_ rows of **Σ**. Since **Σ** is full of zeros elsewhere, your can easily reconstruct it from **s**, like this:"
]
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
+ "# not in the book\n",
"m, n = X.shape\n",
- "\n",
- "S = np.zeros(X_centered.shape)\n",
- "S[:n, :n] = np.diag(s)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.allclose(X_centered, U.dot(S).dot(Vt))"
+ "Σ = np.zeros_like(X_centered)\n",
+ "Σ[:n, :n] = np.diag(s)\n",
+ "assert np.allclose(X_centered, U @ Σ @ Vt)"
]
},
{
@@ -154,21 +555,12 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
- "W2 = Vt.T[:, :2]\n",
- "X2D = X_centered.dot(W2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "X2D_using_svd = X2D"
+ "W2 = Vt[:2].T\n",
+ "X2D = X_centered @ W2"
]
},
{
@@ -187,7 +579,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -197,127 +589,6 @@
"X2D = pca.fit_transform(X)"
]
},
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "X2D[:5]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "X2D_using_svd[:5]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notice that running PCA multiple times on slightly different datasets may result in different results. In general the only difference is that some axes may be flipped. In this example, PCA using Scikit-Learn gives the same projection as the one given by the SVD approach, except both axes are flipped:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.allclose(X2D, -X2D_using_svd)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Recover the 3D points projected on the plane (PCA 2D subspace)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "X3D_inv = pca.inverse_transform(X2D)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Of course, there was some loss of information during the projection step, so the recovered 3D points are not exactly equal to the original 3D points:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.allclose(X3D_inv, X)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can compute the reconstruction error:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.mean(np.sum(np.square(X3D_inv - X), axis=1))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The inverse transform in the SVD approach looks like this:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "X3D_inv_using_svd = X2D_using_svd.dot(Vt[:2, :])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The reconstructions from both methods are not identical because Scikit-Learn's `PCA` class automatically takes care of reversing the mean centering, but if we subtract the mean, we get the same reconstruction:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.allclose(X3D_inv_using_svd, X3D_inv - pca.mean_)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The `PCA` object gives access to the principal components that it computed:"
- ]
- },
{
"cell_type": "code",
"execution_count": 17,
@@ -331,23 +602,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Compare to the first two principal components computed using the SVD method:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "Vt[:2]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Notice how the axes are flipped."
+ "Recover the 3D points projected on the plane (PCA 2D subspace)."
]
},
{
@@ -366,7 +621,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -377,425 +632,23 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The first dimension explains 84.2% of the variance, while the second explains 14.6%."
+ "The first dimension explains about 68% of the variance, while the second explains about 28%."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "By projecting down to 2D, we lost about 1.1% of the variance:"
+ "By projecting down to 2D, we lost about 4% of the variance:"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
- "1 - pca.explained_variance_ratio_.sum()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Here is how to compute the explained variance ratio using the SVD approach (recall that `s` is the diagonal of the matrix `S`):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.square(s) / np.square(s).sum()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Next, let's generate some nice figures! :)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–2. A 3D dataset lying close to a 2D subspace:**"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Utility class to draw 3D arrows (copied from http://stackoverflow.com/questions/11140163)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "from matplotlib.patches import FancyArrowPatch\n",
- "from mpl_toolkits.mplot3d import proj3d\n",
- "\n",
- "class Arrow3D(FancyArrowPatch):\n",
- " def __init__(self, xs, ys, zs, *args, **kwargs):\n",
- " FancyArrowPatch.__init__(self, (0,0), (0,0), *args, **kwargs)\n",
- " self._verts3d = xs, ys, zs\n",
- "\n",
- " def draw(self, renderer):\n",
- " xs3d, ys3d, zs3d = self._verts3d\n",
- " xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)\n",
- " self.set_positions((xs[0],ys[0]),(xs[1],ys[1]))\n",
- " FancyArrowPatch.draw(self, renderer)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Express the plane as a function of x and y."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "axes = [-1.8, 1.8, -1.3, 1.3, -1.0, 1.0]\n",
- "\n",
- "x1s = np.linspace(axes[0], axes[1], 10)\n",
- "x2s = np.linspace(axes[2], axes[3], 10)\n",
- "x1, x2 = np.meshgrid(x1s, x2s)\n",
- "\n",
- "C = pca.components_\n",
- "R = C.T.dot(C)\n",
- "z = (R[0, 2] * x1 + R[1, 2] * x2) / (1 - R[2, 2])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Plot the 3D dataset, the plane and the projections on that plane."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "from mpl_toolkits.mplot3d import Axes3D\n",
- "\n",
- "fig = plt.figure(figsize=(6, 3.8))\n",
- "ax = fig.add_subplot(111, projection='3d')\n",
- "\n",
- "X3D_above = X[X[:, 2] > X3D_inv[:, 2]]\n",
- "X3D_below = X[X[:, 2] <= X3D_inv[:, 2]]\n",
- "\n",
- "ax.plot(X3D_below[:, 0], X3D_below[:, 1], X3D_below[:, 2], \"bo\", alpha=0.5)\n",
- "\n",
- "ax.plot_surface(x1, x2, z, alpha=0.2, color=\"k\")\n",
- "np.linalg.norm(C, axis=0)\n",
- "ax.add_artist(Arrow3D([0, C[0, 0]],[0, C[0, 1]],[0, C[0, 2]], mutation_scale=15, lw=1, arrowstyle=\"-|>\", color=\"k\"))\n",
- "ax.add_artist(Arrow3D([0, C[1, 0]],[0, C[1, 1]],[0, C[1, 2]], mutation_scale=15, lw=1, arrowstyle=\"-|>\", color=\"k\"))\n",
- "ax.plot([0], [0], [0], \"k.\")\n",
- "\n",
- "for i in range(m):\n",
- " if X[i, 2] > X3D_inv[i, 2]:\n",
- " ax.plot([X[i][0], X3D_inv[i][0]], [X[i][1], X3D_inv[i][1]], [X[i][2], X3D_inv[i][2]], \"k-\")\n",
- " else:\n",
- " ax.plot([X[i][0], X3D_inv[i][0]], [X[i][1], X3D_inv[i][1]], [X[i][2], X3D_inv[i][2]], \"k-\", color=\"#505050\")\n",
- " \n",
- "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k+\")\n",
- "ax.plot(X3D_inv[:, 0], X3D_inv[:, 1], X3D_inv[:, 2], \"k.\")\n",
- "ax.plot(X3D_above[:, 0], X3D_above[:, 1], X3D_above[:, 2], \"bo\")\n",
- "ax.set_xlabel(\"$x_1$\", fontsize=18, labelpad=10)\n",
- "ax.set_ylabel(\"$x_2$\", fontsize=18, labelpad=10)\n",
- "ax.set_zlabel(\"$x_3$\", fontsize=18, labelpad=10)\n",
- "ax.set_xlim(axes[0:2])\n",
- "ax.set_ylim(axes[2:4])\n",
- "ax.set_zlim(axes[4:6])\n",
- "\n",
- "# Note: If you are using Matplotlib 3.0.0, it has a bug and does not\n",
- "# display 3D graphs properly.\n",
- "# See https://github.com/matplotlib/matplotlib/issues/12239\n",
- "# You should upgrade to a later version. If you cannot, then you can\n",
- "# use the following workaround before displaying each 3D graph:\n",
- "# for spine in ax.spines.values():\n",
- "# spine.set_visible(False)\n",
- "\n",
- "save_fig(\"dataset_3d_plot\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–3. The new 2D dataset after projection:**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "fig = plt.figure()\n",
- "ax = fig.add_subplot(111, aspect='equal')\n",
- "\n",
- "ax.plot(X2D[:, 0], X2D[:, 1], \"k+\")\n",
- "ax.plot(X2D[:, 0], X2D[:, 1], \"k.\")\n",
- "ax.plot([0], [0], \"ko\")\n",
- "ax.arrow(0, 0, 0, 1, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
- "ax.arrow(0, 0, 1, 0, head_width=0.05, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
- "ax.set_xlabel(\"$z_1$\", fontsize=18)\n",
- "ax.set_ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
- "ax.axis([-1.5, 1.3, -1.2, 1.2])\n",
- "ax.grid(True)\n",
- "save_fig(\"dataset_2d_plot\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–4. Swiss roll dataset:**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.datasets import make_swiss_roll\n",
- "\n",
- "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [],
- "source": [
- "axes = [-11.5, 14, -2, 23, -12, 15]\n",
- "\n",
- "fig = plt.figure(figsize=(6, 5))\n",
- "ax = fig.add_subplot(111, projection='3d')\n",
- "\n",
- "ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=plt.cm.hot)\n",
- "ax.view_init(10, -70)\n",
- "ax.set_xlabel(\"$x_1$\", fontsize=18)\n",
- "ax.set_ylabel(\"$x_2$\", fontsize=18)\n",
- "ax.set_zlabel(\"$x_3$\", fontsize=18)\n",
- "ax.set_xlim(axes[0:2])\n",
- "ax.set_ylim(axes[2:4])\n",
- "ax.set_zlim(axes[4:6])\n",
- "\n",
- "save_fig(\"swiss_roll_plot\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–5. Squashing by projecting onto a plane (left) versus unrolling the Swiss roll (right):**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.figure(figsize=(11, 4))\n",
- "\n",
- "plt.subplot(121)\n",
- "plt.scatter(X[:, 0], X[:, 1], c=t, cmap=plt.cm.hot)\n",
- "plt.axis(axes[:4])\n",
- "plt.xlabel(\"$x_1$\", fontsize=18)\n",
- "plt.ylabel(\"$x_2$\", fontsize=18, rotation=0)\n",
- "plt.grid(True)\n",
- "\n",
- "plt.subplot(122)\n",
- "plt.scatter(t, X[:, 1], c=t, cmap=plt.cm.hot)\n",
- "plt.axis([4, 15, axes[2], axes[3]])\n",
- "plt.xlabel(\"$z_1$\", fontsize=18)\n",
- "plt.grid(True)\n",
- "\n",
- "save_fig(\"squished_swiss_roll_plot\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–6. The decision boundary may not always be simpler with lower dimensions:**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "from matplotlib import gridspec\n",
- "\n",
- "axes = [-11.5, 14, -2, 23, -12, 15]\n",
- "\n",
- "x2s = np.linspace(axes[2], axes[3], 10)\n",
- "x3s = np.linspace(axes[4], axes[5], 10)\n",
- "x2, x3 = np.meshgrid(x2s, x3s)\n",
- "\n",
- "fig = plt.figure(figsize=(6, 5))\n",
- "ax = plt.subplot(111, projection='3d')\n",
- "\n",
- "positive_class = X[:, 0] > 5\n",
- "X_pos = X[positive_class]\n",
- "X_neg = X[~positive_class]\n",
- "ax.view_init(10, -70)\n",
- "ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n",
- "ax.plot_wireframe(5, x2, x3, alpha=0.5)\n",
- "ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n",
- "ax.set_xlabel(\"$x_1$\", fontsize=18)\n",
- "ax.set_ylabel(\"$x_2$\", fontsize=18)\n",
- "ax.set_zlabel(\"$x_3$\", fontsize=18)\n",
- "ax.set_xlim(axes[0:2])\n",
- "ax.set_ylim(axes[2:4])\n",
- "ax.set_zlim(axes[4:6])\n",
- "\n",
- "save_fig(\"manifold_decision_boundary_plot1\")\n",
- "plt.show()\n",
- "\n",
- "fig = plt.figure(figsize=(5, 4))\n",
- "ax = plt.subplot(111)\n",
- "\n",
- "plt.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
- "plt.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
- "plt.axis([4, 15, axes[2], axes[3]])\n",
- "plt.xlabel(\"$z_1$\", fontsize=18)\n",
- "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
- "plt.grid(True)\n",
- "\n",
- "save_fig(\"manifold_decision_boundary_plot2\")\n",
- "plt.show()\n",
- "\n",
- "fig = plt.figure(figsize=(6, 5))\n",
- "ax = plt.subplot(111, projection='3d')\n",
- "\n",
- "positive_class = 2 * (t[:] - 4) > X[:, 1]\n",
- "X_pos = X[positive_class]\n",
- "X_neg = X[~positive_class]\n",
- "ax.view_init(10, -70)\n",
- "ax.plot(X_neg[:, 0], X_neg[:, 1], X_neg[:, 2], \"y^\")\n",
- "ax.plot(X_pos[:, 0], X_pos[:, 1], X_pos[:, 2], \"gs\")\n",
- "ax.set_xlabel(\"$x_1$\", fontsize=18)\n",
- "ax.set_ylabel(\"$x_2$\", fontsize=18)\n",
- "ax.set_zlabel(\"$x_3$\", fontsize=18)\n",
- "ax.set_xlim(axes[0:2])\n",
- "ax.set_ylim(axes[2:4])\n",
- "ax.set_zlim(axes[4:6])\n",
- "\n",
- "save_fig(\"manifold_decision_boundary_plot3\")\n",
- "plt.show()\n",
- "\n",
- "fig = plt.figure(figsize=(5, 4))\n",
- "ax = plt.subplot(111)\n",
- "\n",
- "plt.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
- "plt.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
- "plt.plot([4, 15], [0, 22], \"b-\", linewidth=2)\n",
- "plt.axis([4, 15, axes[2], axes[3]])\n",
- "plt.xlabel(\"$z_1$\", fontsize=18)\n",
- "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
- "plt.grid(True)\n",
- "\n",
- "save_fig(\"manifold_decision_boundary_plot4\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–7. Selecting the subspace to project on:**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "angle = np.pi / 5\n",
- "stretch = 5\n",
- "m = 200\n",
- "\n",
- "np.random.seed(3)\n",
- "X = np.random.randn(m, 2) / 10\n",
- "X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch\n",
- "X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate\n",
- "\n",
- "u1 = np.array([np.cos(angle), np.sin(angle)])\n",
- "u2 = np.array([np.cos(angle - 2 * np.pi/6), np.sin(angle - 2 * np.pi/6)])\n",
- "u3 = np.array([np.cos(angle - np.pi/2), np.sin(angle - np.pi/2)])\n",
- "\n",
- "X_proj1 = X.dot(u1.reshape(-1, 1))\n",
- "X_proj2 = X.dot(u2.reshape(-1, 1))\n",
- "X_proj3 = X.dot(u3.reshape(-1, 1))\n",
- "\n",
- "plt.figure(figsize=(8,4))\n",
- "plt.subplot2grid((3,2), (0, 0), rowspan=3)\n",
- "plt.plot([-1.4, 1.4], [-1.4*u1[1]/u1[0], 1.4*u1[1]/u1[0]], \"k-\", linewidth=1)\n",
- "plt.plot([-1.4, 1.4], [-1.4*u2[1]/u2[0], 1.4*u2[1]/u2[0]], \"k--\", linewidth=1)\n",
- "plt.plot([-1.4, 1.4], [-1.4*u3[1]/u3[0], 1.4*u3[1]/u3[0]], \"k:\", linewidth=2)\n",
- "plt.plot(X[:, 0], X[:, 1], \"bo\", alpha=0.5)\n",
- "plt.axis([-1.4, 1.4, -1.4, 1.4])\n",
- "plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
- "plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=5, length_includes_head=True, head_length=0.1, fc='k', ec='k')\n",
- "plt.text(u1[0] + 0.1, u1[1] - 0.05, r\"$\\mathbf{c_1}$\", fontsize=22)\n",
- "plt.text(u3[0] + 0.1, u3[1], r\"$\\mathbf{c_2}$\", fontsize=22)\n",
- "plt.xlabel(\"$x_1$\", fontsize=18)\n",
- "plt.ylabel(\"$x_2$\", fontsize=18, rotation=0)\n",
- "plt.grid(True)\n",
- "\n",
- "plt.subplot2grid((3,2), (0, 1))\n",
- "plt.plot([-2, 2], [0, 0], \"k-\", linewidth=1)\n",
- "plt.plot(X_proj1[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n",
- "plt.gca().get_yaxis().set_ticks([])\n",
- "plt.gca().get_xaxis().set_ticklabels([])\n",
- "plt.axis([-2, 2, -1, 1])\n",
- "plt.grid(True)\n",
- "\n",
- "plt.subplot2grid((3,2), (1, 1))\n",
- "plt.plot([-2, 2], [0, 0], \"k--\", linewidth=1)\n",
- "plt.plot(X_proj2[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n",
- "plt.gca().get_yaxis().set_ticks([])\n",
- "plt.gca().get_xaxis().set_ticklabels([])\n",
- "plt.axis([-2, 2, -1, 1])\n",
- "plt.grid(True)\n",
- "\n",
- "plt.subplot2grid((3,2), (2, 1))\n",
- "plt.plot([-2, 2], [0, 0], \"k:\", linewidth=2)\n",
- "plt.plot(X_proj3[:, 0], np.zeros(m), \"bo\", alpha=0.3)\n",
- "plt.gca().get_yaxis().set_ticks([])\n",
- "plt.axis([-2, 2, -1, 1])\n",
- "plt.xlabel(\"$z_1$\", fontsize=18)\n",
- "plt.grid(True)\n",
- "\n",
- "save_fig(\"pca_best_projection_plot\")\n",
- "plt.show()"
+ "1 - pca.explained_variance_ratio_.sum() # not in the book"
]
},
{
@@ -805,60 +658,70 @@
"## Choosing the Right Number of Dimensions"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Warning:** since Scikit-Learn 0.24, `fetch_openml()` returns a Pandas `DataFrame` by default. To avoid this and keep the same code as in the book, we set `as_frame=True`."
- ]
- },
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_openml\n",
"\n",
- "mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n",
- "mnist.target = mnist.target.astype(np.uint8)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.model_selection import train_test_split\n",
+ "mnist = fetch_openml('mnist_784', as_frame=False)\n",
+ "X_train, y_train = mnist.data[:60_000], mnist.target[:60_000]\n",
+ "X_test, y_test = mnist.data[60_000:], mnist.target[60_000:]\n",
"\n",
- "X = mnist[\"data\"]\n",
- "y = mnist[\"target\"]\n",
- "\n",
- "X_train, X_test, y_train, y_test = train_test_split(X, y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
"pca = PCA()\n",
"pca.fit(X_train)\n",
"cumsum = np.cumsum(pca.explained_variance_ratio_)\n",
- "d = np.argmax(cumsum >= 0.95) + 1"
+ "d = np.argmax(cumsum >= 0.95) + 1 # d == 154"
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"d"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pca = PCA(n_components=0.95)\n",
+ "X_reduced = pca.fit_transform(X_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_reduced_pca = X_reduced # not in the book (saved for comparison below)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pca.n_components_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pca.explained_variance_ratio_.sum() # not in the book"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -868,7 +731,7 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
@@ -881,7 +744,7 @@
"plt.plot([0, d], [0.95, 0.95], \"k:\")\n",
"plt.plot(d, 0.95, \"ko\")\n",
"plt.annotate(\"Elbow\", xy=(65, 0.85), xytext=(70, 0.7),\n",
- " arrowprops=dict(arrowstyle=\"->\"), fontsize=16)\n",
+ " arrowprops=dict(arrowstyle=\"->\"))\n",
"plt.grid(True)\n",
"save_fig(\"explained_variance_plot\")\n",
"plt.show()"
@@ -889,30 +752,56 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
- "pca = PCA(n_components=0.95)\n",
- "X_reduced = pca.fit_transform(X_train)"
+ "from sklearn.ensemble import RandomForestClassifier\n",
+ "from sklearn.model_selection import RandomizedSearchCV\n",
+ "from sklearn.pipeline import make_pipeline\n",
+ "\n",
+ "clf = make_pipeline(PCA(random_state=42),\n",
+ " RandomForestClassifier(random_state=42))\n",
+ "param_distrib = {\n",
+ " \"pca__n_components\": np.arange(10, 80),\n",
+ " \"randomforestclassifier__n_estimators\": np.arange(50, 500)\n",
+ "}\n",
+ "rnd_search = RandomizedSearchCV(clf, param_distrib, n_iter=10, cv=3,\n",
+ " random_state=42)\n",
+ "rnd_search.fit(X_train[:1000], y_train[:1000])"
]
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
- "pca.n_components_"
+ "print(rnd_search.best_params_)"
]
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
- "np.sum(pca.explained_variance_ratio_)"
+ "from sklearn.linear_model import SGDClassifier\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "\n",
+ "clf = make_pipeline(PCA(random_state=42), SGDClassifier())\n",
+ "param_grid = {\"pca__n_components\": np.arange(10, 80)}\n",
+ "grid_search = GridSearchCV(clf, param_grid, cv=3)\n",
+ "grid_search.fit(X_train[:1000], y_train[:1000])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "grid_search.best_params_"
]
},
{
@@ -924,12 +813,20 @@
},
{
"cell_type": "code",
- "execution_count": 39,
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pca = PCA(0.95)\n",
+ "X_reduced = pca.fit_transform(X_train, y_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
- "pca = PCA(n_components=154)\n",
- "X_reduced = pca.fit_transform(X_train)\n",
"X_recovered = pca.inverse_transform(X_reduced)"
]
},
@@ -942,61 +839,26 @@
},
{
"cell_type": "code",
- "execution_count": 40,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
- "# EXTRA\n",
- "def plot_digits(instances, images_per_row=5, **options):\n",
- " size = 28\n",
- " images_per_row = min(len(instances), images_per_row)\n",
- " # This is equivalent to n_rows = ceil(len(instances) / images_per_row):\n",
- " n_rows = (len(instances) - 1) // images_per_row + 1\n",
+ "# not in the book\n",
"\n",
- " # Append empty images to fill the end of the grid, if needed:\n",
- " n_empty = n_rows * images_per_row - len(instances)\n",
- " padded_instances = np.concatenate([instances, np.zeros((n_empty, size * size))], axis=0)\n",
- "\n",
- " # Reshape the array so it's organized as a grid containing 28×28 images:\n",
- " image_grid = padded_instances.reshape((n_rows, images_per_row, size, size))\n",
- "\n",
- " # Combine axes 0 and 2 (vertical image grid axis, and vertical image axis),\n",
- " # and axes 1 and 3 (horizontal axes). We first need to move the axes that we\n",
- " # want to combine next to each other, using transpose(), and only then we\n",
- " # can reshape:\n",
- " big_image = image_grid.transpose(0, 2, 1, 3).reshape(n_rows * size,\n",
- " images_per_row * size)\n",
- " # Now that we have a big image, we just need to show it:\n",
- " plt.imshow(big_image, cmap = mpl.cm.binary, **options)\n",
- " plt.axis(\"off\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [],
- "source": [
"plt.figure(figsize=(7, 4))\n",
- "plt.subplot(121)\n",
- "plot_digits(X_train[::2100])\n",
- "plt.title(\"Original\", fontsize=16)\n",
- "plt.subplot(122)\n",
- "plot_digits(X_recovered[::2100])\n",
- "plt.title(\"Compressed\", fontsize=16)\n",
+ "for idx, X in enumerate((X_train[::2100], X_recovered[::2100])):\n",
+ " plt.subplot(1, 2, idx + 1)\n",
+ " plt.title([\"Original\", \"Compressed\"][idx])\n",
+ " for row in range(5):\n",
+ " for col in range(5):\n",
+ " plt.imshow(X[row * 5 + col].reshape(28, 28), cmap=\"binary\",\n",
+ " vmin=0, vmax=255, extent=(row, row + 1, col, col + 1))\n",
+ " plt.axis([0, 5, 0, 5])\n",
+ " plt.axis(\"off\")\n",
"\n",
"save_fig(\"mnist_compression_plot\")"
]
},
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_reduced_pca = X_reduced"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -1006,7 +868,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@@ -1023,7 +885,7 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -1032,83 +894,11 @@
"n_batches = 100\n",
"inc_pca = IncrementalPCA(n_components=154)\n",
"for X_batch in np.array_split(X_train, n_batches):\n",
- " print(\".\", end=\"\") # not shown in the book\n",
" inc_pca.partial_fit(X_batch)\n",
"\n",
"X_reduced = inc_pca.transform(X_train)"
]
},
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_recovered_inc_pca = inc_pca.inverse_transform(X_reduced)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's check that compression still works well:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.figure(figsize=(7, 4))\n",
- "plt.subplot(121)\n",
- "plot_digits(X_train[::2100])\n",
- "plt.subplot(122)\n",
- "plot_digits(X_recovered_inc_pca[::2100])\n",
- "plt.tight_layout()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_reduced_inc_pca = X_reduced"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's compare the results of transforming MNIST using regular PCA and incremental PCA. First, the means are equal: "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.allclose(pca.mean_, inc_pca.mean_)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "But the results are not exactly identical. Incremental PCA gives a very good approximate solution, but it's not perfect:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.allclose(X_reduced_pca, X_reduced_inc_pca)"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -1120,36 +910,19 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's create the `memmap()` structure and copy the MNIST data into it. This would typically be done by a first program:"
+ "Let's create the `memmap()` structure, copy the MNIST training set into it, and call `flush()` which ensures that any data still in cache is saved to disk. This would typically be done by a first program:"
]
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
- "filename = \"my_mnist.data\"\n",
- "m, n = X_train.shape\n",
- "\n",
- "X_mm = np.memmap(filename, dtype='float32', mode='write', shape=(m, n))\n",
- "X_mm[:] = X_train"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now deleting the `memmap()` object will trigger its Python finalizer, which ensures that the data is saved to disk."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
- "del X_mm"
+ "filename = \"my_mnist.mmap\"\n",
+ "X_mmap = np.memmap(filename, dtype='float32', mode='write', shape=X_train.shape)\n",
+ "X_mmap[:] = X_train # could be a loop instead, saving the data chunk by chunk\n",
+ "X_mmap.flush()"
]
},
{
@@ -1161,305 +934,118 @@
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
- "X_mm = np.memmap(filename, dtype=\"float32\", mode=\"readonly\", shape=(m, n))\n",
- "\n",
- "batch_size = m // n_batches\n",
+ "X_mmap = np.memmap(filename, dtype=\"float32\", mode=\"readonly\").reshape(-1, 784)\n",
+ "batch_size = X_mmap.shape[0] // n_batches\n",
"inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)\n",
- "inc_pca.fit(X_mm)"
+ "inc_pca.fit(X_mmap)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Time complexity:**"
+ "# Random Projection"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's time regular PCA against Incremental PCA and Randomized PCA, for various number of principal components:"
+ "**Warning**: this sections will use close to 2.5 GB of RAM. If your computer runs out of memory, just reduce _m_ and _n_:"
]
},
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
- "import time\n",
+ "from sklearn.random_projection import johnson_lindenstrauss_min_dim\n",
"\n",
- "for n_components in (2, 10, 154):\n",
- " print(\"n_components =\", n_components)\n",
- " regular_pca = PCA(n_components=n_components, svd_solver=\"full\")\n",
- " inc_pca = IncrementalPCA(n_components=n_components, batch_size=500)\n",
- " rnd_pca = PCA(n_components=n_components, random_state=42, svd_solver=\"randomized\")\n",
+ "m, ε = 5_000, 0.1\n",
+ "d = johnson_lindenstrauss_min_dim(m, eps=ε)\n",
+ "d"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# not in the book\n",
+ "d = int(4 * np.log(m) / (ε ** 2 / 2 - ε ** 3 / 3))\n",
+ "d"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "n = 20_000\n",
+ "np.random.seed(42)\n",
+ "P = np.random.randn(d, n) / np.sqrt(d) # std dev = square root of variance\n",
"\n",
- " for name, pca in ((\"PCA\", regular_pca), (\"Inc PCA\", inc_pca), (\"Rnd PCA\", rnd_pca)):\n",
- " t1 = time.time()\n",
- " pca.fit(X_train)\n",
- " t2 = time.time()\n",
- " print(\" {}: {:.1f} seconds\".format(name, t2 - t1))"
+ "X = np.random.randn(m, n) # generate a fake dataset\n",
+ "X_reduced = X @ P.T"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.random_projection import GaussianRandomProjection\n",
+ "\n",
+ "gaussian_rnd_proj = GaussianRandomProjection(eps=ε, random_state=42)\n",
+ "X_reduced = gaussian_rnd_proj.fit_transform(X) # same result as above"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now let's compare PCA and Randomized PCA for datasets of different sizes (number of instances):"
+ "**Warning**: the following cell may take several minutes to run:"
]
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
- "times_rpca = []\n",
- "times_pca = []\n",
- "sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n",
- "for n_samples in sizes:\n",
- " X = np.random.randn(n_samples, 5)\n",
- " pca = PCA(n_components=2, svd_solver=\"randomized\", random_state=42)\n",
- " t1 = time.time()\n",
- " pca.fit(X)\n",
- " t2 = time.time()\n",
- " times_rpca.append(t2 - t1)\n",
- " pca = PCA(n_components=2, svd_solver=\"full\")\n",
- " t1 = time.time()\n",
- " pca.fit(X)\n",
- " t2 = time.time()\n",
- " times_pca.append(t2 - t1)\n",
- "\n",
- "plt.plot(sizes, times_rpca, \"b-o\", label=\"RPCA\")\n",
- "plt.plot(sizes, times_pca, \"r-s\", label=\"PCA\")\n",
- "plt.xlabel(\"n_samples\")\n",
- "plt.ylabel(\"Training time\")\n",
- "plt.legend(loc=\"upper left\")\n",
- "plt.title(\"PCA and Randomized PCA time complexity \")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And now let's compare their performance on datasets of 2,000 instances with various numbers of features:"
+ "components_pinv = np.linalg.pinv(gaussian_rnd_proj.components_)\n",
+ "X_recovered = X_reduced @ components_pinv.T"
]
},
{
"cell_type": "code",
- "execution_count": 55,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "times_rpca = []\n",
- "times_pca = []\n",
- "sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n",
- "for n_features in sizes:\n",
- " X = np.random.randn(2000, n_features)\n",
- " pca = PCA(n_components=2, random_state=42, svd_solver=\"randomized\")\n",
- " t1 = time.time()\n",
- " pca.fit(X)\n",
- " t2 = time.time()\n",
- " times_rpca.append(t2 - t1)\n",
- " pca = PCA(n_components=2, svd_solver=\"full\")\n",
- " t1 = time.time()\n",
- " pca.fit(X)\n",
- " t2 = time.time()\n",
- " times_pca.append(t2 - t1)\n",
- "\n",
- "plt.plot(sizes, times_rpca, \"b-o\", label=\"RPCA\")\n",
- "plt.plot(sizes, times_pca, \"r-s\", label=\"PCA\")\n",
- "plt.xlabel(\"n_features\")\n",
- "plt.ylabel(\"Training time\")\n",
- "plt.legend(loc=\"upper left\")\n",
- "plt.title(\"PCA and Randomized PCA time complexity \")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Kernel PCA"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
+ "execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
- "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.decomposition import KernelPCA\n",
+ "# not in the book, performance comparison between Gaussian and Sparse RP\n",
"\n",
- "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.04)\n",
- "X_reduced = rbf_pca.fit_transform(X)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–10. Swiss roll reduced to 2D using kPCA with various kernels:**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.decomposition import KernelPCA\n",
+ "from sklearn.random_projection import SparseRandomProjection\n",
"\n",
- "lin_pca = KernelPCA(n_components = 2, kernel=\"linear\", fit_inverse_transform=True)\n",
- "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433, fit_inverse_transform=True)\n",
- "sig_pca = KernelPCA(n_components = 2, kernel=\"sigmoid\", gamma=0.001, coef0=1, fit_inverse_transform=True)\n",
+ "print(\"GaussianRandomProjection fit\")\n",
+ "%timeit GaussianRandomProjection(random_state=42).fit(X)\n",
+ "print(\"SparseRandomProjection fit\")\n",
+ "%timeit SparseRandomProjection(random_state=42).fit(X)\n",
"\n",
- "y = t > 6.9\n",
- "\n",
- "plt.figure(figsize=(11, 4))\n",
- "for subplot, pca, title in ((131, lin_pca, \"Linear kernel\"), (132, rbf_pca, \"RBF kernel, $\\gamma=0.04$\"), (133, sig_pca, \"Sigmoid kernel, $\\gamma=10^{-3}, r=1$\")):\n",
- " X_reduced = pca.fit_transform(X)\n",
- " if subplot == 132:\n",
- " X_reduced_rbf = X_reduced\n",
- " \n",
- " plt.subplot(subplot)\n",
- " #plt.plot(X_reduced[y, 0], X_reduced[y, 1], \"gs\")\n",
- " #plt.plot(X_reduced[~y, 0], X_reduced[~y, 1], \"y^\")\n",
- " plt.title(title, fontsize=14)\n",
- " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n",
- " plt.xlabel(\"$z_1$\", fontsize=18)\n",
- " if subplot == 131:\n",
- " plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
- " plt.grid(True)\n",
- "\n",
- "save_fig(\"kernel_pca_plot\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Code to generate Figure 8–11. Kernel PCA and the reconstruction pre-image error:**"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 59,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.figure(figsize=(6, 5))\n",
- "\n",
- "X_inverse = rbf_pca.inverse_transform(X_reduced_rbf)\n",
- "\n",
- "ax = plt.subplot(111, projection='3d')\n",
- "ax.view_init(10, -70)\n",
- "ax.scatter(X_inverse[:, 0], X_inverse[:, 1], X_inverse[:, 2], c=t, cmap=plt.cm.hot, marker=\"x\")\n",
- "ax.set_xlabel(\"\")\n",
- "ax.set_ylabel(\"\")\n",
- "ax.set_zlabel(\"\")\n",
- "ax.set_xticklabels([])\n",
- "ax.set_yticklabels([])\n",
- "ax.set_zticklabels([])\n",
- "\n",
- "save_fig(\"preimage_plot\", tight_layout=False)\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_reduced = rbf_pca.fit_transform(X)\n",
- "\n",
- "plt.figure(figsize=(11, 4))\n",
- "plt.subplot(132)\n",
- "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot, marker=\"x\")\n",
- "plt.xlabel(\"$z_1$\", fontsize=18)\n",
- "plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
- "plt.grid(True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Selecting a Kernel and Tuning Hyperparameters"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.model_selection import GridSearchCV\n",
- "from sklearn.linear_model import LogisticRegression\n",
- "from sklearn.pipeline import Pipeline\n",
- "\n",
- "clf = Pipeline([\n",
- " (\"kpca\", KernelPCA(n_components=2)),\n",
- " (\"log_reg\", LogisticRegression(solver=\"lbfgs\"))\n",
- " ])\n",
- "\n",
- "param_grid = [{\n",
- " \"kpca__gamma\": np.linspace(0.03, 0.05, 10),\n",
- " \"kpca__kernel\": [\"rbf\", \"sigmoid\"]\n",
- " }]\n",
- "\n",
- "grid_search = GridSearchCV(clf, param_grid, cv=3)\n",
- "grid_search.fit(X, y)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(grid_search.best_params_)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [],
- "source": [
- "rbf_pca = KernelPCA(n_components = 2, kernel=\"rbf\", gamma=0.0433,\n",
- " fit_inverse_transform=True)\n",
- "X_reduced = rbf_pca.fit_transform(X)\n",
- "X_preimage = rbf_pca.inverse_transform(X_reduced)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.metrics import mean_squared_error\n",
- "\n",
- "mean_squared_error(X, X_preimage)"
+ "gaussian_rnd_proj = GaussianRandomProjection(random_state=42).fit(X)\n",
+ "sparse_rnd_proj = SparseRandomProjection(random_state=42).fit(X)\n",
+ "print(\"GaussianRandomProjection transform\")\n",
+ "%timeit gaussian_rnd_proj.transform(X)\n",
+ "print(\"SparseRandomProjection transform\")\n",
+ "%timeit sparse_rnd_proj.transform(X)"
]
},
{
@@ -1471,23 +1057,16 @@
},
{
"cell_type": "code",
- "execution_count": 65,
- "metadata": {},
- "outputs": [],
- "source": [
- "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=41)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
+ "execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
+ "from sklearn.datasets import make_swiss_roll\n",
"from sklearn.manifold import LocallyLinearEmbedding\n",
"\n",
+ "X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)\n",
"lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)\n",
- "X_reduced = lle.fit_transform(X)"
+ "X_unrolled = lle.fit_transform(X_swiss)"
]
},
{
@@ -1499,15 +1078,16 @@
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
- "plt.title(\"Unrolled swiss roll using LLE\", fontsize=14)\n",
- "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n",
- "plt.xlabel(\"$z_1$\", fontsize=18)\n",
- "plt.ylabel(\"$z_2$\", fontsize=18)\n",
- "plt.axis([-0.065, 0.055, -0.1, 0.12])\n",
+ "plt.title(\"Unrolled swiss roll using LLE\")\n",
+ "plt.scatter(X_unrolled[:, 0], X_unrolled[:, 1],\n",
+ " c=t, cmap=darker_hot)\n",
+ "plt.xlabel(\"$z_1$\")\n",
+ "plt.ylabel(\"$z_2$\", rotation=0)\n",
+ "plt.axis([-0.055, 0.060, -0.070, 0.090])\n",
"plt.grid(True)\n",
"\n",
"save_fig(\"lle_unrolling_plot\")\n",
@@ -1515,61 +1095,55 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 46,
"metadata": {},
+ "outputs": [],
"source": [
- "## Other Dimensionality Reduction Techniques"
+ "# not in the book: shows how well correlated z1 is to t: LLE worked fine\n",
+ "plt.title(\"$z_1$ vs $t$\")\n",
+ "plt.scatter(X_unrolled[:, 0], t, c=t, cmap=darker_hot)\n",
+ "plt.xlabel(\"$z_1$\")\n",
+ "plt.ylabel(\"$t$\", rotation=0)\n",
+ "plt.grid(True)\n",
+ "plt.show()"
]
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.manifold import MDS\n",
"\n",
"mds = MDS(n_components=2, random_state=42)\n",
- "X_reduced_mds = mds.fit_transform(X)"
+ "X_reduced_mds = mds.fit_transform(X_swiss)"
]
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.manifold import Isomap\n",
"\n",
"isomap = Isomap(n_components=2)\n",
- "X_reduced_isomap = isomap.fit_transform(X)"
+ "X_reduced_isomap = isomap.fit_transform(X_swiss)"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.manifold import TSNE\n",
"\n",
- "tsne = TSNE(n_components=2, random_state=42)\n",
- "X_reduced_tsne = tsne.fit_transform(X)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
- "\n",
- "lda = LinearDiscriminantAnalysis(n_components=2)\n",
- "X_mnist = mnist[\"data\"]\n",
- "y_mnist = mnist[\"target\"]\n",
- "lda.fit(X_mnist, y_mnist)\n",
- "X_reduced_lda = lda.transform(X_mnist)"
+ "tsne = TSNE(n_components=2, init=\"random\", learning_rate=\"auto\",\n",
+ " random_state=42)\n",
+ "X_reduced_tsne = tsne.fit_transform(X_swiss)"
]
},
{
@@ -1581,7 +1155,7 @@
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
@@ -1592,17 +1166,67 @@
"for subplot, title, X_reduced in zip((131, 132, 133), titles,\n",
" (X_reduced_mds, X_reduced_isomap, X_reduced_tsne)):\n",
" plt.subplot(subplot)\n",
- " plt.title(title, fontsize=14)\n",
- " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=plt.cm.hot)\n",
- " plt.xlabel(\"$z_1$\", fontsize=18)\n",
+ " plt.title(title)\n",
+ " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=darker_hot)\n",
+ " plt.xlabel(\"$z_1$\")\n",
" if subplot == 131:\n",
- " plt.ylabel(\"$z_2$\", fontsize=18, rotation=0)\n",
+ " plt.ylabel(\"$z_2$\", rotation=0)\n",
" plt.grid(True)\n",
"\n",
"save_fig(\"other_dim_reduction_plot\")\n",
"plt.show()"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Extra Material – Kernel PCA"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.decomposition import KernelPCA\n",
+ "\n",
+ "rbf_pca = KernelPCA(n_components=2, kernel=\"rbf\", gamma=0.04, random_state=42)\n",
+ "X_reduced = rbf_pca.fit_transform(X_swiss)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lin_pca = KernelPCA(kernel=\"linear\")\n",
+ "rbf_pca = KernelPCA(kernel=\"rbf\", gamma=0.002)\n",
+ "sig_pca = KernelPCA(kernel=\"sigmoid\", gamma=0.002, coef0=1)\n",
+ "\n",
+ "kernel_pcas = ((lin_pca, \"Linear kernel\"),\n",
+ " (rbf_pca, rf\"RBF kernel, $\\gamma={rbf_pca.gamma}$\"),\n",
+ " (sig_pca, rf\"Sigmoid kernel, $\\gamma={sig_pca.gamma}, r={sig_pca.coef0}$\"))\n",
+ "\n",
+ "plt.figure(figsize=(11, 3.5))\n",
+ "for idx, (kpca, title) in enumerate(kernel_pcas):\n",
+ " kpca.n_components = 2\n",
+ " kpca.random_state = 42\n",
+ " X_reduced = kpca.fit_transform(X_swiss)\n",
+ "\n",
+ " plt.subplot(1, 3, idx + 1)\n",
+ " plt.title(title)\n",
+ " plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=t, cmap=darker_hot)\n",
+ " plt.xlabel(\"$z_1$\")\n",
+ " if idx == 0:\n",
+ " plt.ylabel(\"$z_2$\", rotation=0)\n",
+ " plt.grid()\n",
+ "\n",
+ "plt.show()"
+ ]
+ },
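+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Kernel PCA can also learn an approximate inverse mapping (the reconstruction pre-image), which lets us measure how much information the projection lost. Here is a minimal sketch, reusing the RBF hyperparameters from above and asking Scikit-Learn to fit the inverse transform:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.metrics import mean_squared_error\n",
+ "\n",
+ "# refit Kernel PCA, this time also learning the reconstruction pre-image\n",
+ "rbf_pca = KernelPCA(n_components=2, kernel=\"rbf\", gamma=0.04,\n",
+ "                    fit_inverse_transform=True, random_state=42)\n",
+ "X_reduced = rbf_pca.fit_transform(X_swiss)\n",
+ "X_preimage = rbf_pca.inverse_transform(X_reduced)\n",
+ "mean_squared_error(X_swiss, X_preimage)  # reconstruction pre-image error"
+ ]
+ },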
{
"cell_type": "markdown",
"metadata": {},
@@ -1635,7 +1259,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Load the MNIST dataset (introduced in chapter 3) and split it into a training set and a test set (take the first 60,000 instances for training, and the remaining 10,000 for testing).*"
+ "Exercise: _Load the MNIST dataset (introduced in chapter 3) and split it into a training set and a test set (take the first 60,000 instances for training, and the remaining 10,000 for testing)._"
]
},
{
@@ -1647,60 +1271,45 @@
},
{
"cell_type": "code",
- "execution_count": 73,
+ "execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
- "X_train = mnist['data'][:60000]\n",
- "y_train = mnist['target'][:60000]\n",
+ "X_train = mnist.data[:60000]\n",
+ "y_train = mnist.target[:60000]\n",
"\n",
- "X_test = mnist['data'][60000:]\n",
- "y_test = mnist['target'][60000:]"
+ "X_test = mnist.data[60000:]\n",
+ "y_test = mnist.target[60000:]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Train a Random Forest classifier on the dataset and time how long it takes, then evaluate the resulting model on the test set.*"
+ "Exercise: _Train a Random Forest classifier on the dataset and time how long it takes, then evaluate the resulting model on the test set._"
]
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.ensemble import RandomForestClassifier\n",
- "\n",
"rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)"
]
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
- "import time\n",
- "\n",
- "t0 = time.time()\n",
- "rnd_clf.fit(X_train, y_train)\n",
- "t1 = time.time()"
+ "%time rnd_clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
- "execution_count": 76,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"Training took {:.2f}s\".format(t1 - t0))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
+ "execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -1714,12 +1323,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Next, use PCA to reduce the dataset's dimensionality, with an explained variance ratio of 95%.*"
+ "Exercise: _Next, use PCA to reduce the dataset's dimensionality, with an explained variance ratio of 95%._"
]
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@@ -1733,53 +1342,42 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Train a new Random Forest classifier on the reduced dataset and see how long it takes. Was training much faster?*"
+ "Exercise: _Train a new Random Forest classifier on the reduced dataset and see how long it takes. Was training much faster?_"
]
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
- "rnd_clf2 = RandomForestClassifier(n_estimators=100, random_state=42)\n",
- "t0 = time.time()\n",
- "rnd_clf2.fit(X_train_reduced, y_train)\n",
- "t1 = time.time()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 80,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"Training took {:.2f}s\".format(t1 - t0))"
+ "rnd_clf_with_pca = RandomForestClassifier(n_estimators=100, random_state=42)\n",
+ "%time rnd_clf_with_pca.fit(X_train_reduced, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Oh no! Training is actually more than twice slower now! How can that be? Well, as we saw in this chapter, dimensionality reduction does not always lead to faster training time: it depends on the dataset, the model and the training algorithm. See figure 8-6 (the `manifold_decision_boundary_plot*` plots above). If you try a softmax classifier instead of a random forest classifier, you will find that training time is reduced by a factor of 3 when using PCA. Actually, we will do this in a second, but first let's check the precision of the new random forest classifier."
+ "Oh no! Training is actually about twice slower now! How can that be? Well, as we saw in this chapter, dimensionality reduction does not always lead to faster training time: it depends on the dataset, the model and the training algorithm. See figure 8-6 (the `manifold_decision_boundary_plot*` plots above). If you try `SGDClassifier` instead of `RandomForestClassifier`, you will find that training time is reduced by a factor of 3 when using PCA. Actually, we will do this in a second, but first let's check the precision of the new random forest classifier."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Next evaluate the classifier on the test set: how does it compare to the previous classifier?*"
+ "Exercise: _Next evaluate the classifier on the test set: how does it compare to the previous classifier?_"
]
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"X_test_reduced = pca.transform(X_test)\n",
"\n",
- "y_pred = rnd_clf2.predict(X_test_reduced)\n",
+ "y_pred = rnd_clf_with_pca.predict(X_test_reduced)\n",
"accuracy_score(y_test, y_pred)"
]
},
@@ -1787,41 +1385,35 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "It is common for performance to drop slightly when reducing dimensionality, because we do lose some useful signal in the process. However, the performance drop is rather severe in this case. So PCA really did not help: it slowed down training and reduced performance. :(\n",
+ "It is common for performance to drop slightly when reducing dimensionality, because we do lose some potentially useful signal in the process. However, the performance drop is rather severe in this case. So PCA really did not help: it slowed down training *and* reduced performance. 😭"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Exercise: _Try again with an `SGDClassifier`. How much does PCA help now?_"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.linear_model import SGDClassifier\n",
"\n",
- "Let's see if it helps when using softmax regression:"
+ "sgd_clf = SGDClassifier(random_state=42)\n",
+ "%time sgd_clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.linear_model import LogisticRegression\n",
- "\n",
- "log_clf = LogisticRegression(multi_class=\"multinomial\", solver=\"lbfgs\", random_state=42)\n",
- "t0 = time.time()\n",
- "log_clf.fit(X_train, y_train)\n",
- "t1 = time.time()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 83,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"Training took {:.2f}s\".format(t1 - t0))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 84,
- "metadata": {},
- "outputs": [],
- "source": [
- "y_pred = log_clf.predict(X_test)\n",
+ "y_pred = sgd_clf.predict(X_test)\n",
"accuracy_score(y_test, y_pred)"
]
},
@@ -1829,44 +1421,33 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Okay, so softmax regression takes much longer to train on this dataset than the random forest classifier, plus it performs worse on the test set. But that's not what we are interested in right now, we want to see how much PCA can help softmax regression. Let's train the softmax regression model using the reduced dataset:"
+ "Okay, so the `SGDClassifier` takes much longer to train on this dataset than the `RandomForestClassifier`, plus it performs worse on the test set. But that's not what we are interested in right now, we want to see how much PCA can help `SGDClassifier`. Let's train it using the reduced dataset:"
]
},
{
"cell_type": "code",
- "execution_count": 85,
+ "execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
- "log_clf2 = LogisticRegression(multi_class=\"multinomial\", solver=\"lbfgs\", random_state=42)\n",
- "t0 = time.time()\n",
- "log_clf2.fit(X_train_reduced, y_train)\n",
- "t1 = time.time()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 86,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"Training took {:.2f}s\".format(t1 - t0))"
+ "sgd_clf_with_pca = SGDClassifier(random_state=42)\n",
+ "%time sgd_clf_with_pca.fit(X_train_reduced, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Nice! Reducing dimensionality led to over 2× speedup. :) Let's check the model's accuracy:"
+ "Nice! Reducing dimensionality led to roughly 5× speedup. :) Let's check the model's accuracy:"
]
},
{
"cell_type": "code",
- "execution_count": 87,
+ "execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
- "y_pred = log_clf2.predict(X_test_reduced)\n",
+ "y_pred = sgd_clf_with_pca.predict(X_test_reduced)\n",
"accuracy_score(y_test, y_pred)"
]
},
@@ -1874,14 +1455,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "A very slight drop in performance, which might be a reasonable price to pay for a 2× speedup, depending on the application."
+ "Great! PCA not only gave us a 5× speed boost, it also improved performance slightly."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "So there you have it: PCA can give you a formidable speedup... but not always!"
+ "So there you have it: PCA can give you a formidable speedup, and if you're lucky a performance boost... but it's really not guaranteed: it depends on the model and the dataset!"
]
},
{
@@ -1895,55 +1476,43 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Use t-SNE to reduce the MNIST dataset down to two dimensions and plot the result using Matplotlib. You can use a scatterplot using 10 different colors to represent each image's target class.*"
+ "Exercise: _Use t-SNE to reduce the first 5,000 images of the MNIST dataset down to two dimensions and plot the result using Matplotlib. You can use a scatterplot using 10 different colors to represent each image's target class._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "The MNIST dataset was loaded above."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Dimensionality reduction on the full 60,000 images takes a very long time, so let's only do this on a random subset of 10,000 images:"
+ "Let's limit ourselves to the first 5,000 images of the MNIST training set, to speed things up a lot."
]
},
{
"cell_type": "code",
- "execution_count": 88,
+ "execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
- "np.random.seed(42)\n",
- "\n",
- "m = 10000\n",
- "idx = np.random.permutation(60000)[:m]\n",
- "\n",
- "X = mnist['data'][idx]\n",
- "y = mnist['target'][idx]"
+ "X_sample, y_sample = X_train[:5000], y_train[:5000]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now let's use t-SNE to reduce dimensionality down to 2D so we can plot the dataset:"
+ "Let's use t-SNE to reduce dimensionality down to 2D so we can plot the dataset:"
]
},
{
"cell_type": "code",
- "execution_count": 89,
+ "execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.manifold import TSNE\n",
"\n",
- "tsne = TSNE(n_components=2, random_state=42)\n",
- "X_reduced = tsne.fit_transform(X)"
+ "tsne = TSNE(n_components=2, init=\"random\", learning_rate=\"auto\",\n",
+ " random_state=42)\n",
+ "%time X_reduced = tsne.fit_transform(X_sample)"
]
},
{
@@ -1955,12 +1524,13 @@
},
{
"cell_type": "code",
- "execution_count": 90,
+ "execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
- "plt.figure(figsize=(13,10))\n",
- "plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, cmap=\"jet\")\n",
+ "plt.figure(figsize=(13, 10))\n",
+ "plt.scatter(X_reduced[:, 0], X_reduced[:, 1],\n",
+ " c=y_sample.astype(np.int8), cmap=\"jet\", alpha=0.5)\n",
"plt.axis('off')\n",
"plt.colorbar()\n",
"plt.show()"
@@ -1970,26 +1540,27 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Isn't this just beautiful? :) This plot tells us which numbers are easily distinguishable from the others (e.g., 0s, 6s, and most 8s are rather well separated clusters), and it also tells us which numbers are often hard to distinguish (e.g., 4s and 9s, 5s and 3s, and so on)."
+ "Isn't this just beautiful? :) Most digits are nicely separated from the others, even though t-SNE wasn't given the targets: it just identified clusters of similar images. But there is still a bit of overlap. For example, the 3s and the 5s overlap a lot (on the right side of the plot), and so do the 4s and the 9s (in the top-right corner)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's focus on digits 2, 3 and 5, which seem to overlap a lot."
+ "Let's focus on just the digits 4 and 9:"
]
},
{
"cell_type": "code",
- "execution_count": 91,
+ "execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
- "plt.figure(figsize=(9,9))\n",
- "cmap = mpl.cm.get_cmap(\"jet\")\n",
- "for digit in (2, 3, 5):\n",
- " plt.scatter(X_reduced[y == digit, 0], X_reduced[y == digit, 1], c=[cmap(digit / 9)])\n",
+ "plt.figure(figsize=(9, 9))\n",
+ "cmap = plt.cm.jet\n",
+ "for digit in ('4', '9'):\n",
+ " plt.scatter(X_reduced[y_sample == digit, 0], X_reduced[y_sample == digit, 1],\n",
+ " c=[cmap(float(digit) / 9)], alpha=0.5)\n",
"plt.axis('off')\n",
"plt.show()"
]
@@ -1998,32 +1569,35 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's see if we can produce a nicer image by running t-SNE on these 3 digits:"
+ "Let's see if we can produce a nicer image by running t-SNE on just these 2 digits:"
]
},
{
"cell_type": "code",
- "execution_count": 92,
+ "execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
- "idx = (y == 2) | (y == 3) | (y == 5) \n",
- "X_subset = X[idx]\n",
- "y_subset = y[idx]\n",
+ "idx = (y_sample == '4') | (y_sample == '9')\n",
+ "X_subset = X_sample[idx]\n",
+ "y_subset = y_sample[idx]\n",
"\n",
- "tsne_subset = TSNE(n_components=2, random_state=42)\n",
+ "tsne_subset = TSNE(n_components=2, init=\"random\", learning_rate=\"auto\",\n",
+ " random_state=42)\n",
"X_subset_reduced = tsne_subset.fit_transform(X_subset)"
]
},
{
"cell_type": "code",
- "execution_count": 93,
+ "execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
- "plt.figure(figsize=(9,9))\n",
- "for digit in (2, 3, 5):\n",
- " plt.scatter(X_subset_reduced[y_subset == digit, 0], X_subset_reduced[y_subset == digit, 1], c=[cmap(digit / 9)])\n",
+ "plt.figure(figsize=(9, 9))\n",
+ "for digit in ('4', '9'):\n",
+ " plt.scatter(X_subset_reduced[y_subset == digit, 0],\n",
+ " X_subset_reduced[y_subset == digit, 1],\n",
+ " c=[cmap(float(digit) / 9)], alpha=0.5)\n",
"plt.axis('off')\n",
"plt.show()"
]
@@ -2032,14 +1606,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Much better, now the clusters have far less overlap. But some 3s are all over the place. Plus, there are two distinct clusters of 2s, and also two distinct clusters of 5s. It would be nice if we could visualize a few digits from each cluster, to understand why this is the case. Let's do that now. "
+ "That's much better, although there's still a bit of overlap. Perhaps some 4s really do look like 9s, and vice versa. It would be nice if we could visualize a few digits from each region of this plot, to understand what's going on. In fact, let's do that now."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Alternatively, you can write colored digits at the location of each instance, or even plot scaled-down versions of the digit images themselves (if you plot all digits, the visualization will be too cluttered, so you should either draw a random sample or plot an instance only if no other instance has already been plotted at a close distance). You should get a nice visualization with well-separated clusters of digits.*"
+ "Exercise: _Alternatively, you can write colored digits at the location of each instance, or even plot scaled-down versions of the digit images themselves (if you plot all digits, the visualization will be too cluttered, so you should either draw a random sample or plot an instance only if no other instance has already been plotted at a close distance). You should get a nice visualization with well-separated clusters of digits._"
]
},
{
@@ -2051,14 +1625,14 @@
},
{
"cell_type": "code",
- "execution_count": 94,
+ "execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import MinMaxScaler\n",
"from matplotlib.offsetbox import AnnotationBbox, OffsetImage\n",
"\n",
- "def plot_digits(X, y, min_distance=0.05, images=None, figsize=(13, 10)):\n",
+ "def plot_digits(X, y, min_distance=0.04, images=None, figsize=(13, 10)):\n",
" # Let's scale the input features so that they range from 0 to 1\n",
" X_normalized = MinMaxScaler().fit_transform(X)\n",
" # Now we create the list of coordinates of the digits plotted so far.\n",
@@ -2067,22 +1641,25 @@
" neighbors = np.array([[10., 10.]])\n",
" # The rest should be self-explanatory\n",
" plt.figure(figsize=figsize)\n",
- " cmap = mpl.cm.get_cmap(\"jet\")\n",
+ " cmap = plt.cm.jet\n",
" digits = np.unique(y)\n",
" for digit in digits:\n",
- " plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1], c=[cmap(digit / 9)])\n",
+ " plt.scatter(X_normalized[y == digit, 0], X_normalized[y == digit, 1],\n",
+ " c=[cmap(float(digit) / 9)], alpha=0.5)\n",
" plt.axis(\"off\")\n",
- " ax = plt.gcf().gca() # get current axes in current figure\n",
+ " ax = plt.gca() # get current axes\n",
" for index, image_coord in enumerate(X_normalized):\n",
" closest_distance = np.linalg.norm(neighbors - image_coord, axis=1).min()\n",
" if closest_distance > min_distance:\n",
" neighbors = np.r_[neighbors, [image_coord]]\n",
" if images is None:\n",
" plt.text(image_coord[0], image_coord[1], str(int(y[index])),\n",
- " color=cmap(y[index] / 9), fontdict={\"weight\": \"bold\", \"size\": 16})\n",
+ " color=cmap(float(y[index]) / 9),\n",
+ " fontdict={\"weight\": \"bold\", \"size\": 16})\n",
" else:\n",
" image = images[index].reshape(28, 28)\n",
- " imagebox = AnnotationBbox(OffsetImage(image, cmap=\"binary\"), image_coord)\n",
+ " imagebox = AnnotationBbox(OffsetImage(image, cmap=\"binary\"),\n",
+ " image_coord)\n",
" ax.add_artist(imagebox)"
]
},
@@ -2090,16 +1667,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's try it! First let's just write colored digits:"
+ "Let's try it! First let's show colored digits (not images), for all 5,000 images:"
]
},
{
"cell_type": "code",
- "execution_count": 95,
+ "execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
- "plot_digits(X_reduced, y)"
+ "plot_digits(X_reduced, y_sample)"
]
},
{
@@ -2111,16 +1688,23 @@
},
{
"cell_type": "code",
- "execution_count": 96,
+ "execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
- "plot_digits(X_reduced, y, images=X, figsize=(35, 25))"
+ "plot_digits(X_reduced, y_sample, images=X_sample, figsize=(35, 25))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "That's nicer! Now let's focus on just the 3s and the 5s:"
]
},
{
"cell_type": "code",
- "execution_count": 97,
+ "execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@@ -2131,7 +1715,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "*Exercise: Try using other dimensionality reduction algorithms such as PCA, LLE, or MDS and compare the resulting visualizations.*"
+ "Notice how similar-looking 4s are grouped together. For example, the 4s get more and more inclined as they approach the top of the figure. The inclined 9s are also closer to the top. Some 4s really do look like 9s, and vice versa."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Exercise: _Try using other dimensionality reduction algorithms such as PCA, LLE, or MDS and compare the resulting visualizations._"
]
},
{
@@ -2143,18 +1734,12 @@
},
{
"cell_type": "code",
- "execution_count": 98,
+ "execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.decomposition import PCA\n",
- "import time\n",
- "\n",
- "t0 = time.time()\n",
- "X_pca_reduced = PCA(n_components=2, random_state=42).fit_transform(X)\n",
- "t1 = time.time()\n",
- "print(\"PCA took {:.1f}s.\".format(t1 - t0))\n",
- "plot_digits(X_pca_reduced, y)\n",
+ "%time X_pca_reduced = PCA(n_components=2).fit_transform(X_sample)\n",
+ "plot_digits(X_pca_reduced, y_sample)\n",
"plt.show()"
]
},
@@ -2167,17 +1752,13 @@
},
{
"cell_type": "code",
- "execution_count": 99,
+ "execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.manifold import LocallyLinearEmbedding\n",
- "\n",
- "t0 = time.time()\n",
- "X_lle_reduced = LocallyLinearEmbedding(n_components=2, random_state=42).fit_transform(X)\n",
- "t1 = time.time()\n",
- "print(\"LLE took {:.1f}s.\".format(t1 - t0))\n",
- "plot_digits(X_lle_reduced, y)\n",
+ "lle = LocallyLinearEmbedding(n_components=2, random_state=42)\n",
+ "%time X_lle_reduced = lle.fit_transform(X_sample)\n",
+ "plot_digits(X_lle_reduced, y_sample)\n",
"plt.show()"
]
},
@@ -2185,57 +1766,52 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "That took a while, and the result does not look too good. Let's see what happens if we apply PCA first, preserving 95% of the variance:"
+ "That took more time, and yet the result does not look good at all. Let's see what happens if we apply PCA first, preserving 95% of the variance:"
]
},
{
"cell_type": "code",
- "execution_count": 100,
+ "execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.pipeline import Pipeline\n",
+ "pca_lle = make_pipeline(PCA(n_components=0.95),\n",
+ " LocallyLinearEmbedding(n_components=2, random_state=42))\n",
"\n",
- "pca_lle = Pipeline([\n",
- " (\"pca\", PCA(n_components=0.95, random_state=42)),\n",
- " (\"lle\", LocallyLinearEmbedding(n_components=2, random_state=42)),\n",
- "])\n",
- "t0 = time.time()\n",
- "X_pca_lle_reduced = pca_lle.fit_transform(X)\n",
- "t1 = time.time()\n",
- "print(\"PCA+LLE took {:.1f}s.\".format(t1 - t0))\n",
- "plot_digits(X_pca_lle_reduced, y)\n",
- "plt.show()"
+ "%time X_pca_lle_reduced = pca_lle.fit_transform(X_sample)\n",
+ "plot_digits(X_pca_lle_reduced, y_sample)\n",
+ "plt.show()tight_layout="
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "The result is more or less the same, but this time it was almost 4× faster."
+ "The result is more or less as bad, but this time training was a bit faster."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's try MDS. It's much too long if we run it on 10,000 instances, so let's just try 2,000 for now:"
+ "Let's try MDS:"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:"
]
},
{
"cell_type": "code",
- "execution_count": 101,
+ "execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.manifold import MDS\n",
- "\n",
- "m = 2000\n",
- "t0 = time.time()\n",
- "X_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(X[:m])\n",
- "t1 = time.time()\n",
- "print(\"MDS took {:.1f}s (on just 2,000 MNIST images instead of 10,000).\".format(t1 - t0))\n",
- "plot_digits(X_mds_reduced, y[:m])\n",
+ "%time X_mds_reduced = MDS(n_components=2, random_state=42).fit_transform(X_sample)\n",
+ "plot_digits(X_mds_reduced, y_sample)\n",
"plt.show()"
]
},
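+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Not in the book: if the previous cell is too slow on your machine, you can get a rough idea of the result by running MDS on a subset, for example just the first 2,000 images:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# not in the book: a much quicker rough look at MDS on only 2,000 images\n",
+ "X_mds_subset = MDS(n_components=2, random_state=42).fit_transform(X_sample[:2000])\n",
+ "plot_digits(X_mds_subset, y_sample[:2000])\n",
+ "plt.show()"
+ ]
+ },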
@@ -2246,23 +1822,24 @@
"Meh. This does not look great, all clusters overlap too much. Let's try with PCA first, perhaps it will be faster?"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 102,
+ "execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
- "from sklearn.pipeline import Pipeline\n",
+ "pca_mds = make_pipeline(PCA(n_components=0.95, random_state=42),\n",
+ " MDS(n_components=2, random_state=42))\n",
"\n",
- "pca_mds = Pipeline([\n",
- " (\"pca\", PCA(n_components=0.95, random_state=42)),\n",
- " (\"mds\", MDS(n_components=2, random_state=42)),\n",
- "])\n",
- "t0 = time.time()\n",
- "X_pca_mds_reduced = pca_mds.fit_transform(X[:2000])\n",
- "t1 = time.time()\n",
- "print(\"PCA+MDS took {:.1f}s (on 2,000 MNIST images).\".format(t1 - t0))\n",
- "plot_digits(X_pca_mds_reduced, y[:2000])\n",
+ "%time X_pca_mds_reduced = pca_mds.fit_transform(X_sample)\n",
+ "plot_digits(X_pca_mds_reduced, y_sample)\n",
"plt.show()"
]
},
@@ -2270,29 +1847,27 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Same result, and no speedup: PCA did not help (or hurt)."
+ "Same result, and not faster: PCA did not help in this case."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's try LDA:"
+ "Let's try LDA now:"
]
},
{
"cell_type": "code",
- "execution_count": 103,
+ "execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
"\n",
- "t0 = time.time()\n",
- "X_lda_reduced = LinearDiscriminantAnalysis(n_components=2).fit_transform(X, y)\n",
- "t1 = time.time()\n",
- "print(\"LDA took {:.1f}s.\".format(t1 - t0))\n",
- "plot_digits(X_lda_reduced, y, figsize=(12,12))\n",
+ "lda = LinearDiscriminantAnalysis(n_components=2)\n",
+ "%time X_lda_reduced = lda.fit_transform(X_sample, y_sample)\n",
+ "plot_digits(X_lda_reduced, y_sample, figsize=(12,12))\n",
"plt.show()"
]
},
@@ -2307,55 +1882,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Well, it's pretty clear that t-SNE won this little competition, wouldn't you agree? We did not time it, so let's do that now:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 104,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.manifold import TSNE\n",
- "\n",
- "t0 = time.time()\n",
- "X_tsne_reduced = TSNE(n_components=2, random_state=42).fit_transform(X)\n",
- "t1 = time.time()\n",
- "print(\"t-SNE took {:.1f}s.\".format(t1 - t0))\n",
- "plot_digits(X_tsne_reduced, y)\n",
- "plt.show()"
+ "Well, it's pretty clear that t-SNE won this little competition, wouldn't you agree?"
]
},
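+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Not in the book: as an optional extra, you could also check whether running PCA first (preserving 95% of the variance) speeds t-SNE up without hurting the visualization. A sketch of what that could look like:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# not in the book: chain PCA and t-SNE, then time and plot the result\n",
+ "pca_tsne = make_pipeline(PCA(n_components=0.95, random_state=42),\n",
+ "                         TSNE(n_components=2, init=\"random\",\n",
+ "                              learning_rate=\"auto\", random_state=42))\n",
+ "%time X_pca_tsne_reduced = pca_tsne.fit_transform(X_sample)\n",
+ "plot_digits(X_pca_tsne_reduced, y_sample)\n",
+ "plt.show()"
+ ]
+ },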
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "It's twice slower than LLE, but still much faster than MDS, and the result looks great. Let's see if a bit of PCA can speed it up:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 105,
- "metadata": {},
- "outputs": [],
- "source": [
- "pca_tsne = Pipeline([\n",
- " (\"pca\", PCA(n_components=0.95, random_state=42)),\n",
- " (\"tsne\", TSNE(n_components=2, random_state=42)),\n",
- "])\n",
- "t0 = time.time()\n",
- "X_pca_tsne_reduced = pca_tsne.fit_transform(X)\n",
- "t1 = time.time()\n",
- "print(\"PCA+t-SNE took {:.1f}s.\".format(t1 - t0))\n",
- "plot_digits(X_pca_tsne_reduced, y)\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Yes, PCA roughly gave us over 2x speedup, without damaging the result. We have a winner!"
+ "And that's all for today, I hope you enjoyed this chapter!"
]
},
{
@@ -2368,7 +1902,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},