Improve 3D dataset and add missing random_state=42 for PCA in exercises

2021-11-19 23:36:04 +13:00 · 2021-11-19 23:36:04 +13:00 · 3e7fe8e2ff
parent 6a4660de6f
commit 3e7fe8e2ff
1 changed files with 106 additions and 108 deletions
--- a/07_dimensionality_reduction.ipynb
+++ b/07_dimensionality_reduction.ipynb
@ -125,6 +125,13 @@
    "# PCA"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This chapter starts with several figures to explain the concepts of PCA and Manifold Learning. Below is the code to generate these figures. You can skip directly to the [Principal Components](#Principal-Components) section below if you want."
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -144,20 +151,13 @@
    "\n",
    "np.random.seed(42)\n",
    "m = 60\n",
-    "w1, w2 = 0.2, 0.5\n",
-    "noise = 0.2\n",
-    "angles = np.random.rand(m) * 2 * np.pi * 0.8 + np.pi / 2\n",
-    "X = np.empty((m, 3))\n",
-    "X[:, 0] = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2\n",
-    "X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n",
-    "X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "This chapter starts with several figures to explain the concepts of PCA and Manifold Learning. Below is the code to generate these figures. You can skip directly to the [Principal Components](#Principal-Components) section below."
+    "angles = (np.random.rand(m) ** 3 + 0.5) * 2 * np.pi\n",
+    "X = np.zeros((m, 3))\n",
+    "X[:, 0] = np.cos(angles)\n",
+    "X[:, 1] = np.sin(angles) * 0.5\n",
+    "X += 0.28 * np.random.randn(m, 3)\n",
+    "X = rotate_3d(X, -np.pi / 4, np.pi / 30, -np.pi / 20)\n",
+    "X += [0.2, 0, 0.2]"
   ]
  },
  {
@ -177,7 +177,9 @@
  {
   "cell_type": "code",
   "execution_count": 6,
-   "metadata": {},
+   "metadata": {
+    "tags": []
+   },
   "outputs": [],
   "source": [
    "# not in the book\n",
@ -289,7 +291,7 @@
   "source": [
    "from sklearn.datasets import make_swiss_roll\n",
    "\n",
-    "X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
+    "X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
   ]
  },
  {
@ -298,6 +300,8 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# not in the book\n",
+    "\n",
    "from matplotlib.colors import ListedColormap\n",
    "\n",
    "darker_hot = ListedColormap(plt.cm.hot(np.linspace(0, 0.8, 256)))\n",
@ -307,7 +311,7 @@
    "fig = plt.figure(figsize=(6, 5))\n",
    "ax = fig.add_subplot(111, projection='3d')\n",
    "\n",
-    "ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=darker_hot)\n",
+    "ax.scatter(X_swiss[:, 0], X_swiss[:, 1], X_swiss[:, 2], c=t, cmap=darker_hot)\n",
    "ax.view_init(10, -70)\n",
    "set_xyz_axes(ax, axes)\n",
    "save_fig(\"swiss_roll_plot\")\n",
@ -330,14 +334,14 @@
    "plt.figure(figsize=(10, 4))\n",
    "\n",
    "plt.subplot(121)\n",
-    "plt.scatter(X[:, 0], X[:, 1], c=t, cmap=darker_hot)\n",
+    "plt.scatter(X_swiss[:, 0], X_swiss[:, 1], c=t, cmap=darker_hot)\n",
    "plt.axis(axes[:4])\n",
    "plt.xlabel(\"$x_1$\")\n",
    "plt.ylabel(\"$x_2$\", labelpad=10, rotation=0)\n",
    "plt.grid(True)\n",
    "\n",
    "plt.subplot(122)\n",
-    "plt.scatter(t, X[:, 1], c=t, cmap=darker_hot)\n",
+    "plt.scatter(t, X_swiss[:, 1], c=t, cmap=darker_hot)\n",
    "plt.axis([4, 14.8, axes[2], axes[3]])\n",
    "plt.xlabel(\"$z_1$\")\n",
    "plt.grid(True)\n",
@ -364,9 +368,9 @@
    "x3s = np.linspace(axes[4], axes[5], 10)\n",
    "x2, x3 = np.meshgrid(x2s, x3s)\n",
    "\n",
-    "positive_class = X[:, 0] > 5\n",
-    "X_pos = X[positive_class]\n",
-    "X_neg = X[~positive_class]\n",
+    "positive_class = X_swiss[:, 0] > 5\n",
+    "X_pos = X_swiss[positive_class]\n",
+    "X_neg = X_swiss[~positive_class]\n",
    "\n",
    "fig = plt.figure(figsize=(6, 5))\n",
    "ax = plt.subplot(1, 1, 1, projection='3d')\n",
@ -380,8 +384,8 @@
    "\n",
    "fig = plt.figure(figsize=(5, 4))\n",
    "ax = plt.subplot(1, 1, 1)\n",
-    "ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
-    "ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
+    "ax.plot(t[positive_class], X_swiss[positive_class, 1], \"gs\")\n",
+    "ax.plot(t[~positive_class], X_swiss[~positive_class, 1], \"y^\")\n",
    "ax.axis([4, 15, axes[2], axes[3]])\n",
    "ax.set_xlabel(\"$z_1$\")\n",
    "ax.set_ylabel(\"$z_2$\", rotation=0, labelpad=8)\n",
@ -389,9 +393,9 @@
    "save_fig(\"manifold_decision_boundary_plot2\")\n",
    "plt.show()\n",
    "\n",
-    "positive_class = 2 * (t[:] - 4) > X[:, 1]\n",
-    "X_pos = X[positive_class]\n",
-    "X_neg = X[~positive_class]\n",
+    "positive_class = 2 * (t[:] - 4) > X_swiss[:, 1]\n",
+    "X_pos = X_swiss[positive_class]\n",
+    "X_neg = X_swiss[~positive_class]\n",
    "\n",
    "fig = plt.figure(figsize=(6, 5))\n",
    "ax = plt.subplot(1, 1, 1, projection='3d')\n",
@ -412,8 +416,8 @@
    "\n",
    "fig = plt.figure(figsize=(5, 4))\n",
    "ax = plt.subplot(1, 1, 1)\n",
-    "ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
-    "ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
+    "ax.plot(t[positive_class], X_swiss[positive_class, 1], \"gs\")\n",
+    "ax.plot(t[~positive_class], X_swiss[~positive_class, 1], \"y^\")\n",
    "ax.plot([4, 15], [0, 22], \"b-\", linewidth=2)\n",
    "ax.axis([4, 15, axes[2], axes[3]])\n",
    "ax.set_xlabel(\"$z_1$\")\n",
@ -436,23 +440,25 @@
   "metadata": {},
   "outputs": [],
   "source": [
+    "# not in the book\n",
+    "\n",
    "angle = np.pi / 5\n",
    "stretch = 5\n",
    "m = 200\n",
    "\n",
    "np.random.seed(3)\n",
-    "X = np.random.randn(m, 2) / 10\n",
-    "X = X @ np.array([[stretch, 0], [0, 1]])  # stretch\n",
-    "X = X @ [[np.cos(angle), np.sin(angle)],\n",
-    "         [np.sin(angle), np.cos(angle)]]  # rotate\n",
+    "X_line = np.random.randn(m, 2) / 10\n",
+    "X_line = X_line @ np.array([[stretch, 0], [0, 1]])  # stretch\n",
+    "X_line = X_line @ [[np.cos(angle), np.sin(angle)],\n",
+    "                   [np.sin(angle), np.cos(angle)]]  # rotate\n",
    "\n",
    "u1 = np.array([np.cos(angle), np.sin(angle)])\n",
    "u2 = np.array([np.cos(angle - 2 * np.pi / 6), np.sin(angle - 2 * np.pi / 6)])\n",
    "u3 = np.array([np.cos(angle - np.pi / 2), np.sin(angle - np.pi / 2)])\n",
    "\n",
-    "X_proj1 = X @ u1.reshape(-1, 1)\n",
-    "X_proj2 = X @ u2.reshape(-1, 1)\n",
-    "X_proj3 = X @ u3.reshape(-1, 1)\n",
+    "X_proj1 = X_line @ u1.reshape(-1, 1)\n",
+    "X_proj2 = X_line @ u2.reshape(-1, 1)\n",
+    "X_proj3 = X_line @ u3.reshape(-1, 1)\n",
    "\n",
    "plt.figure(figsize=(8, 4))\n",
    "plt.subplot2grid((3, 2), (0, 0), rowspan=3)\n",
@ -462,7 +468,7 @@
    "         linewidth=2)\n",
    "plt.plot([-1.4, 1.4], [-1.4 * u3[1] / u3[0], 1.4 * u3[1] / u3[0]], \"k:\",\n",
    "         linewidth=2)\n",
-    "plt.plot(X[:, 0], X[:, 1], \"ro\", alpha=0.5)\n",
+    "plt.plot(X_line[:, 0], X_line[:, 1], \"ro\", alpha=0.5)\n",
    "plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=4, alpha=0.9,\n",
    "          length_includes_head=True, head_length=0.1, fc=\"b\", ec=\"b\", zorder=10)\n",
    "plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=1, alpha=0.9,\n",
@ -632,14 +638,14 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "The first dimension explains about 68% of the variance, while the second explains about 28%."
+    "The first dimension explains about 76% of the variance, while the second explains about 15%."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "By projecting down to 2D, we lost about 4% of the variance:"
+    "By projecting down to 2D, we lost about 9% of the variance:"
   ]
  },
  {
@ -673,7 +679,7 @@
    "pca = PCA()\n",
    "pca.fit(X_train)\n",
    "cumsum = np.cumsum(pca.explained_variance_ratio_)\n",
-    "d = np.argmax(cumsum >= 0.95) + 1  # d == 154"
+    "d = np.argmax(cumsum >= 0.95) + 1  # d equals 154"
   ]
  },
  {
@ -700,22 +706,13 @@
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
-   "source": [
-    "X_reduced_pca = X_reduced  # not in the book (saved for comparison below)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [],
   "source": [
    "pca.n_components_"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
@ -731,7 +728,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
@ -752,7 +749,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
@ -773,7 +770,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
@ -782,7 +779,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
@ -797,7 +794,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
@ -813,7 +810,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
@ -823,7 +820,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
@ -839,7 +836,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
@ -868,7 +865,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
@ -885,7 +882,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
@ -915,7 +912,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
@ -934,7 +931,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
@ -960,7 +957,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
@ -973,7 +970,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
@ -984,7 +981,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
@ -998,7 +995,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1017,7 +1014,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1027,7 +1024,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1057,7 +1054,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1078,7 +1075,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 45,
+   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1096,7 +1093,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 46,
+   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1111,7 +1108,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 47,
+   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1123,7 +1120,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 48,
+   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1135,7 +1132,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 49,
+   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1155,7 +1152,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1186,7 +1183,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 51,
+   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1198,7 +1195,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 52,
+   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1271,7 +1268,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1291,7 +1288,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 54,
+   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1300,7 +1297,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 55,
+   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1309,7 +1306,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 56,
+   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1328,7 +1325,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 57,
+   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1347,7 +1344,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 58,
+   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1371,7 +1368,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 59,
+   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1397,7 +1394,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
+   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1409,7 +1406,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1426,7 +1423,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 62,
+   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1443,7 +1440,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 63,
+   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1488,7 +1485,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 64,
+   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1504,7 +1501,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 65,
+   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1524,7 +1521,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 66,
+   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1552,7 +1549,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 67,
+   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1574,7 +1571,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 68,
+   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1589,7 +1586,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1625,7 +1622,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 70,
+   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1672,7 +1669,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 71,
+   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1688,7 +1685,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 72,
+   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1704,7 +1701,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 73,
+   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1734,11 +1731,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
-    "%time X_pca_reduced = PCA(n_components=2).fit_transform(X_sample)\n",
+    "pca = PCA(n_components=2, random_state=42)\n",
+    "%time X_pca_reduced = pca.fit_transform(X_sample)\n",
    "plot_digits(X_pca_reduced, y_sample)\n",
    "plt.show()"
   ]
@ -1752,7 +1750,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1771,7 +1769,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 76,
+   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1780,7 +1778,7 @@
    "\n",
    "%time X_pca_lle_reduced = pca_lle.fit_transform(X_sample)\n",
    "plot_digits(X_pca_lle_reduced, y_sample)\n",
-    "plt.show()tight_layout="
+    "plt.show()"
   ]
  },
  {
@ -1801,12 +1799,12 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:"
+    "**Warning**: the following cell will take about 10 minutes to run, depending on your hardware:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1826,12 +1824,12 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:"
+    "**Warning**: the following cell will take about 10 minutes to run, depending on your hardware:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
@ -1859,7 +1857,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [