From 3e7fe8e2ff579cadb6d325d1b440a64376ed7369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Fri, 19 Nov 2021 23:36:04 +1300 Subject: [PATCH] Improve 3D dataset and add missing random_state=42 for PCA in exercises --- 07_dimensionality_reduction.ipynb | 214 +++++++++++++++--------------- 1 file changed, 106 insertions(+), 108 deletions(-) diff --git a/07_dimensionality_reduction.ipynb b/07_dimensionality_reduction.ipynb index f80126a..ebb5ff7 100644 --- a/07_dimensionality_reduction.ipynb +++ b/07_dimensionality_reduction.ipynb @@ -125,6 +125,13 @@ "# PCA" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This chapter starts with several figures to explain the concepts of PCA and Manifold Learning. Below is the code to generate these figures. You can skip directly to the [Principal Components](#Principal-Components) section below if you want." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -144,20 +151,13 @@ "\n", "np.random.seed(42)\n", "m = 60\n", - "w1, w2 = 0.2, 0.5\n", - "noise = 0.2\n", - "angles = np.random.rand(m) * 2 * np.pi * 0.8 + np.pi / 2\n", - "X = np.empty((m, 3))\n", - "X[:, 0] = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2\n", - "X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n", - "X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This chapter starts with several figures to explain the concepts of PCA and Manifold Learning. Below is the code to generate these figures. You can skip directly to the [Principal Components](#Principal-Components) section below." + "angles = (np.random.rand(m) ** 3 + 0.5) * 2 * np.pi\n", + "X = np.zeros((m, 3))\n", + "X[:, 0] = np.cos(angles)\n", + "X[:, 1] = np.sin(angles) * 0.5\n", + "X += 0.28 * np.random.randn(m, 3)\n", + "X = rotate_3d(X, -np.pi / 4, np.pi / 30, -np.pi / 20)\n", + "X += [0.2, 0, 0.2]" ] }, { @@ -177,7 +177,9 @@ { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# not in the book\n", @@ -289,7 +291,7 @@ "source": [ "from sklearn.datasets import make_swiss_roll\n", "\n", - "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)" + "X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)" ] }, { @@ -298,6 +300,8 @@ "metadata": {}, "outputs": [], "source": [ + "# not in the book\n", + "\n", "from matplotlib.colors import ListedColormap\n", "\n", "darker_hot = ListedColormap(plt.cm.hot(np.linspace(0, 0.8, 256)))\n", @@ -307,7 +311,7 @@ "fig = plt.figure(figsize=(6, 5))\n", "ax = fig.add_subplot(111, projection='3d')\n", "\n", - "ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=darker_hot)\n", + "ax.scatter(X_swiss[:, 0], X_swiss[:, 1], X_swiss[:, 2], c=t, cmap=darker_hot)\n", "ax.view_init(10, -70)\n", "set_xyz_axes(ax, axes)\n", "save_fig(\"swiss_roll_plot\")\n", @@ -330,14 +334,14 @@ "plt.figure(figsize=(10, 4))\n", "\n", "plt.subplot(121)\n", - "plt.scatter(X[:, 0], X[:, 1], c=t, cmap=darker_hot)\n", + "plt.scatter(X_swiss[:, 0], X_swiss[:, 1], c=t, cmap=darker_hot)\n", "plt.axis(axes[:4])\n", "plt.xlabel(\"$x_1$\")\n", "plt.ylabel(\"$x_2$\", labelpad=10, rotation=0)\n", "plt.grid(True)\n", "\n", "plt.subplot(122)\n", - "plt.scatter(t, X[:, 1], c=t, cmap=darker_hot)\n", + "plt.scatter(t, X_swiss[:, 1], c=t, cmap=darker_hot)\n", "plt.axis([4, 14.8, axes[2], axes[3]])\n", "plt.xlabel(\"$z_1$\")\n", "plt.grid(True)\n", @@ -364,9 +368,9 @@ "x3s = np.linspace(axes[4], axes[5], 10)\n", "x2, x3 = np.meshgrid(x2s, x3s)\n", "\n", - "positive_class = X[:, 0] > 5\n", - "X_pos = X[positive_class]\n", - "X_neg = X[~positive_class]\n", + "positive_class = X_swiss[:, 0] > 5\n", + "X_pos = X_swiss[positive_class]\n", + "X_neg = X_swiss[~positive_class]\n", "\n", "fig = plt.figure(figsize=(6, 5))\n", "ax = plt.subplot(1, 1, 1, projection='3d')\n", @@ -380,8 +384,8 @@ "\n", "fig = plt.figure(figsize=(5, 4))\n", "ax = plt.subplot(1, 1, 1)\n", - "ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n", - "ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n", + "ax.plot(t[positive_class], X_swiss[positive_class, 1], \"gs\")\n", + "ax.plot(t[~positive_class], X_swiss[~positive_class, 1], \"y^\")\n", "ax.axis([4, 15, axes[2], axes[3]])\n", "ax.set_xlabel(\"$z_1$\")\n", "ax.set_ylabel(\"$z_2$\", rotation=0, labelpad=8)\n", @@ -389,9 +393,9 @@ "save_fig(\"manifold_decision_boundary_plot2\")\n", "plt.show()\n", "\n", - "positive_class = 2 * (t[:] - 4) > X[:, 1]\n", - "X_pos = X[positive_class]\n", - "X_neg = X[~positive_class]\n", + "positive_class = 2 * (t[:] - 4) > X_swiss[:, 1]\n", + "X_pos = X_swiss[positive_class]\n", + "X_neg = X_swiss[~positive_class]\n", "\n", "fig = plt.figure(figsize=(6, 5))\n", "ax = plt.subplot(1, 1, 1, projection='3d')\n", @@ -412,8 +416,8 @@ "\n", "fig = plt.figure(figsize=(5, 4))\n", "ax = plt.subplot(1, 1, 1)\n", - "ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n", - "ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n", + "ax.plot(t[positive_class], X_swiss[positive_class, 1], \"gs\")\n", + "ax.plot(t[~positive_class], X_swiss[~positive_class, 1], \"y^\")\n", "ax.plot([4, 15], [0, 22], \"b-\", linewidth=2)\n", "ax.axis([4, 15, axes[2], axes[3]])\n", "ax.set_xlabel(\"$z_1$\")\n", @@ -436,23 +440,25 @@ "metadata": {}, "outputs": [], "source": [ + "# not in the book\n", + "\n", "angle = np.pi / 5\n", "stretch = 5\n", "m = 200\n", "\n", "np.random.seed(3)\n", - "X = np.random.randn(m, 2) / 10\n", - "X = X @ np.array([[stretch, 0], [0, 1]]) # stretch\n", - "X = X @ [[np.cos(angle), np.sin(angle)],\n", - " [np.sin(angle), np.cos(angle)]] # rotate\n", + "X_line = np.random.randn(m, 2) / 10\n", + "X_line = X_line @ np.array([[stretch, 0], [0, 1]]) # stretch\n", + "X_line = X_line @ [[np.cos(angle), np.sin(angle)],\n", + " [np.sin(angle), np.cos(angle)]] # rotate\n", "\n", "u1 = np.array([np.cos(angle), np.sin(angle)])\n", "u2 = np.array([np.cos(angle - 2 * np.pi / 6), np.sin(angle - 2 * np.pi / 6)])\n", "u3 = np.array([np.cos(angle - np.pi / 2), np.sin(angle - np.pi / 2)])\n", "\n", - "X_proj1 = X @ u1.reshape(-1, 1)\n", - "X_proj2 = X @ u2.reshape(-1, 1)\n", - "X_proj3 = X @ u3.reshape(-1, 1)\n", + "X_proj1 = X_line @ u1.reshape(-1, 1)\n", + "X_proj2 = X_line @ u2.reshape(-1, 1)\n", + "X_proj3 = X_line @ u3.reshape(-1, 1)\n", "\n", "plt.figure(figsize=(8, 4))\n", "plt.subplot2grid((3, 2), (0, 0), rowspan=3)\n", @@ -462,7 +468,7 @@ " linewidth=2)\n", "plt.plot([-1.4, 1.4], [-1.4 * u3[1] / u3[0], 1.4 * u3[1] / u3[0]], \"k:\",\n", " linewidth=2)\n", - "plt.plot(X[:, 0], X[:, 1], \"ro\", alpha=0.5)\n", + "plt.plot(X_line[:, 0], X_line[:, 1], \"ro\", alpha=0.5)\n", "plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=4, alpha=0.9,\n", " length_includes_head=True, head_length=0.1, fc=\"b\", ec=\"b\", zorder=10)\n", "plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=1, alpha=0.9,\n", @@ -632,14 +638,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The first dimension explains about 68% of the variance, while the second explains about 28%." + "The first dimension explains about 76% of the variance, while the second explains about 15%." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "By projecting down to 2D, we lost about 4% of the variance:" + "By projecting down to 2D, we lost about 9% of the variance:" ] }, { @@ -673,7 +679,7 @@ "pca = PCA()\n", "pca.fit(X_train)\n", "cumsum = np.cumsum(pca.explained_variance_ratio_)\n", - "d = np.argmax(cumsum >= 0.95) + 1 # d == 154" + "d = np.argmax(cumsum >= 0.95) + 1 # d equals 154" ] }, { @@ -700,22 +706,13 @@ "execution_count": 23, "metadata": {}, "outputs": [], - "source": [ - "X_reduced_pca = X_reduced # not in the book (saved for comparison below)" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], "source": [ "pca.n_components_" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -731,7 +728,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -752,7 +749,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -773,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -782,7 +779,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -797,7 +794,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -813,7 +810,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -823,7 +820,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -839,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -868,7 +865,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -885,7 +882,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -915,7 +912,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -934,7 +931,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -960,7 +957,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ @@ -973,7 +970,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -984,7 +981,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -998,7 +995,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -1017,7 +1014,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ @@ -1027,7 +1024,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -1057,7 +1054,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1078,7 +1075,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1096,7 +1093,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -1111,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -1123,7 +1120,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1135,7 +1132,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -1155,7 +1152,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -1186,7 +1183,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -1198,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -1271,7 +1268,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -1291,7 +1288,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -1300,7 +1297,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -1309,7 +1306,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -1328,7 +1325,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -1347,7 +1344,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -1371,7 +1368,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -1397,7 +1394,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ @@ -1409,7 +1406,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1426,7 +1423,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1443,7 +1440,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -1488,7 +1485,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1504,7 +1501,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1524,7 +1521,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1552,7 +1549,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1574,7 +1571,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1589,7 +1586,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1625,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1672,7 +1669,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1688,7 +1685,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1704,7 +1701,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1734,11 +1731,12 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ - "%time X_pca_reduced = PCA(n_components=2).fit_transform(X_sample)\n", + "pca = PCA(n_components=2, random_state=42)\n", + "%time X_pca_reduced = pca.fit_transform(X_sample)\n", "plot_digits(X_pca_reduced, y_sample)\n", "plt.show()" ] @@ -1752,7 +1750,7 @@ }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1771,7 +1769,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1780,7 +1778,7 @@ "\n", "%time X_pca_lle_reduced = pca_lle.fit_transform(X_sample)\n", "plot_digits(X_pca_lle_reduced, y_sample)\n", - "plt.show()tight_layout=" + "plt.show()" ] }, { @@ -1801,12 +1799,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:" + "**Warning**: the following cell will take about 10 minutes to run, depending on your hardware:" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1826,12 +1824,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:" + "**Warning**: the following cell will take about 10 minutes to run, depending on your hardware:" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -1859,7 +1857,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [