Improve 3D dataset and add missing random_state=42 for PCA in exercises

main
Aurélien Geron 2021-11-19 23:36:04 +13:00
parent 6a4660de6f
commit 3e7fe8e2ff
1 changed file with 106 additions and 108 deletions

View File

@ -125,6 +125,13 @@
"# PCA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This chapter starts with several figures to explain the concepts of PCA and Manifold Learning. Below is the code to generate these figures. You can skip directly to the [Principal Components](#Principal-Components) section below if you want."
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -144,20 +151,13 @@
"\n",
"np.random.seed(42)\n",
"m = 60\n",
"w1, w2 = 0.2, 0.5\n",
"noise = 0.2\n",
"angles = np.random.rand(m) * 2 * np.pi * 0.8 + np.pi / 2\n",
"X = np.empty((m, 3))\n",
"X[:, 0] = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2\n",
"X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n",
"X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This chapter starts with several figures to explain the concepts of PCA and Manifold Learning. Below is the code to generate these figures. You can skip directly to the [Principal Components](#Principal-Components) section below."
"angles = (np.random.rand(m) ** 3 + 0.5) * 2 * np.pi\n",
"X = np.zeros((m, 3))\n",
"X[:, 0] = np.cos(angles)\n",
"X[:, 1] = np.sin(angles) * 0.5\n",
"X += 0.28 * np.random.randn(m, 3)\n",
"X = rotate_3d(X, -np.pi / 4, np.pi / 30, -np.pi / 20)\n",
"X += [0.2, 0, 0.2]"
]
},
{
@ -177,7 +177,9 @@
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# not in the book\n",
@ -289,7 +291,7 @@
"source": [
"from sklearn.datasets import make_swiss_roll\n",
"\n",
"X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
"X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)"
]
},
{
@ -298,6 +300,8 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book\n",
"\n",
"from matplotlib.colors import ListedColormap\n",
"\n",
"darker_hot = ListedColormap(plt.cm.hot(np.linspace(0, 0.8, 256)))\n",
@ -307,7 +311,7 @@
"fig = plt.figure(figsize=(6, 5))\n",
"ax = fig.add_subplot(111, projection='3d')\n",
"\n",
"ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=t, cmap=darker_hot)\n",
"ax.scatter(X_swiss[:, 0], X_swiss[:, 1], X_swiss[:, 2], c=t, cmap=darker_hot)\n",
"ax.view_init(10, -70)\n",
"set_xyz_axes(ax, axes)\n",
"save_fig(\"swiss_roll_plot\")\n",
@ -330,14 +334,14 @@
"plt.figure(figsize=(10, 4))\n",
"\n",
"plt.subplot(121)\n",
"plt.scatter(X[:, 0], X[:, 1], c=t, cmap=darker_hot)\n",
"plt.scatter(X_swiss[:, 0], X_swiss[:, 1], c=t, cmap=darker_hot)\n",
"plt.axis(axes[:4])\n",
"plt.xlabel(\"$x_1$\")\n",
"plt.ylabel(\"$x_2$\", labelpad=10, rotation=0)\n",
"plt.grid(True)\n",
"\n",
"plt.subplot(122)\n",
"plt.scatter(t, X[:, 1], c=t, cmap=darker_hot)\n",
"plt.scatter(t, X_swiss[:, 1], c=t, cmap=darker_hot)\n",
"plt.axis([4, 14.8, axes[2], axes[3]])\n",
"plt.xlabel(\"$z_1$\")\n",
"plt.grid(True)\n",
@ -364,9 +368,9 @@
"x3s = np.linspace(axes[4], axes[5], 10)\n",
"x2, x3 = np.meshgrid(x2s, x3s)\n",
"\n",
"positive_class = X[:, 0] > 5\n",
"X_pos = X[positive_class]\n",
"X_neg = X[~positive_class]\n",
"positive_class = X_swiss[:, 0] > 5\n",
"X_pos = X_swiss[positive_class]\n",
"X_neg = X_swiss[~positive_class]\n",
"\n",
"fig = plt.figure(figsize=(6, 5))\n",
"ax = plt.subplot(1, 1, 1, projection='3d')\n",
@ -380,8 +384,8 @@
"\n",
"fig = plt.figure(figsize=(5, 4))\n",
"ax = plt.subplot(1, 1, 1)\n",
"ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
"ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
"ax.plot(t[positive_class], X_swiss[positive_class, 1], \"gs\")\n",
"ax.plot(t[~positive_class], X_swiss[~positive_class, 1], \"y^\")\n",
"ax.axis([4, 15, axes[2], axes[3]])\n",
"ax.set_xlabel(\"$z_1$\")\n",
"ax.set_ylabel(\"$z_2$\", rotation=0, labelpad=8)\n",
@ -389,9 +393,9 @@
"save_fig(\"manifold_decision_boundary_plot2\")\n",
"plt.show()\n",
"\n",
"positive_class = 2 * (t[:] - 4) > X[:, 1]\n",
"X_pos = X[positive_class]\n",
"X_neg = X[~positive_class]\n",
"positive_class = 2 * (t[:] - 4) > X_swiss[:, 1]\n",
"X_pos = X_swiss[positive_class]\n",
"X_neg = X_swiss[~positive_class]\n",
"\n",
"fig = plt.figure(figsize=(6, 5))\n",
"ax = plt.subplot(1, 1, 1, projection='3d')\n",
@ -412,8 +416,8 @@
"\n",
"fig = plt.figure(figsize=(5, 4))\n",
"ax = plt.subplot(1, 1, 1)\n",
"ax.plot(t[positive_class], X[positive_class, 1], \"gs\")\n",
"ax.plot(t[~positive_class], X[~positive_class, 1], \"y^\")\n",
"ax.plot(t[positive_class], X_swiss[positive_class, 1], \"gs\")\n",
"ax.plot(t[~positive_class], X_swiss[~positive_class, 1], \"y^\")\n",
"ax.plot([4, 15], [0, 22], \"b-\", linewidth=2)\n",
"ax.axis([4, 15, axes[2], axes[3]])\n",
"ax.set_xlabel(\"$z_1$\")\n",
@ -436,23 +440,25 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book\n",
"\n",
"angle = np.pi / 5\n",
"stretch = 5\n",
"m = 200\n",
"\n",
"np.random.seed(3)\n",
"X = np.random.randn(m, 2) / 10\n",
"X = X @ np.array([[stretch, 0], [0, 1]]) # stretch\n",
"X = X @ [[np.cos(angle), np.sin(angle)],\n",
" [np.sin(angle), np.cos(angle)]] # rotate\n",
"X_line = np.random.randn(m, 2) / 10\n",
"X_line = X_line @ np.array([[stretch, 0], [0, 1]]) # stretch\n",
"X_line = X_line @ [[np.cos(angle), np.sin(angle)],\n",
" [np.sin(angle), np.cos(angle)]] # rotate\n",
"\n",
"u1 = np.array([np.cos(angle), np.sin(angle)])\n",
"u2 = np.array([np.cos(angle - 2 * np.pi / 6), np.sin(angle - 2 * np.pi / 6)])\n",
"u3 = np.array([np.cos(angle - np.pi / 2), np.sin(angle - np.pi / 2)])\n",
"\n",
"X_proj1 = X @ u1.reshape(-1, 1)\n",
"X_proj2 = X @ u2.reshape(-1, 1)\n",
"X_proj3 = X @ u3.reshape(-1, 1)\n",
"X_proj1 = X_line @ u1.reshape(-1, 1)\n",
"X_proj2 = X_line @ u2.reshape(-1, 1)\n",
"X_proj3 = X_line @ u3.reshape(-1, 1)\n",
"\n",
"plt.figure(figsize=(8, 4))\n",
"plt.subplot2grid((3, 2), (0, 0), rowspan=3)\n",
@ -462,7 +468,7 @@
" linewidth=2)\n",
"plt.plot([-1.4, 1.4], [-1.4 * u3[1] / u3[0], 1.4 * u3[1] / u3[0]], \"k:\",\n",
" linewidth=2)\n",
"plt.plot(X[:, 0], X[:, 1], \"ro\", alpha=0.5)\n",
"plt.plot(X_line[:, 0], X_line[:, 1], \"ro\", alpha=0.5)\n",
"plt.arrow(0, 0, u1[0], u1[1], head_width=0.1, linewidth=4, alpha=0.9,\n",
" length_includes_head=True, head_length=0.1, fc=\"b\", ec=\"b\", zorder=10)\n",
"plt.arrow(0, 0, u3[0], u3[1], head_width=0.1, linewidth=1, alpha=0.9,\n",
@ -632,14 +638,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The first dimension explains about 68% of the variance, while the second explains about 28%."
"The first dimension explains about 76% of the variance, while the second explains about 15%."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"By projecting down to 2D, we lost about 4% of the variance:"
"By projecting down to 2D, we lost about 9% of the variance:"
]
},
{
@ -673,7 +679,7 @@
"pca = PCA()\n",
"pca.fit(X_train)\n",
"cumsum = np.cumsum(pca.explained_variance_ratio_)\n",
"d = np.argmax(cumsum >= 0.95) + 1 # d == 154"
"d = np.argmax(cumsum >= 0.95) + 1 # d equals 154"
]
},
{
@ -700,22 +706,13 @@
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"X_reduced_pca = X_reduced # not in the book (saved for comparison below)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"pca.n_components_"
]
},
{
"cell_type": "code",
"execution_count": 25,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@ -731,7 +728,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@ -752,7 +749,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
@ -773,7 +770,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
@ -782,7 +779,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@ -797,7 +794,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@ -813,7 +810,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@ -823,7 +820,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@ -839,7 +836,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@ -868,7 +865,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@ -885,7 +882,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@ -915,7 +912,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@ -934,7 +931,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@ -960,7 +957,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
@ -973,7 +970,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@ -984,7 +981,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
@ -998,7 +995,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
@ -1017,7 +1014,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
@ -1027,7 +1024,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@ -1057,7 +1054,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
@ -1078,7 +1075,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
@ -1096,7 +1093,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
@ -1111,7 +1108,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@ -1123,7 +1120,7 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
@ -1135,7 +1132,7 @@
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
@ -1155,7 +1152,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
@ -1186,7 +1183,7 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
@ -1198,7 +1195,7 @@
},
{
"cell_type": "code",
"execution_count": 52,
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@ -1271,7 +1268,7 @@
},
{
"cell_type": "code",
"execution_count": 53,
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
@ -1291,7 +1288,7 @@
},
{
"cell_type": "code",
"execution_count": 54,
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
@ -1300,7 +1297,7 @@
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
@ -1309,7 +1306,7 @@
},
{
"cell_type": "code",
"execution_count": 56,
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
@ -1328,7 +1325,7 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@ -1347,7 +1344,7 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@ -1371,7 +1368,7 @@
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@ -1397,7 +1394,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
@ -1409,7 +1406,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@ -1426,7 +1423,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
@ -1443,7 +1440,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@ -1488,7 +1485,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
@ -1504,7 +1501,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@ -1524,7 +1521,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
@ -1552,7 +1549,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@ -1574,7 +1571,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
@ -1589,7 +1586,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
@ -1625,7 +1622,7 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
@ -1672,7 +1669,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@ -1688,7 +1685,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@ -1704,7 +1701,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
@ -1734,11 +1731,12 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"%time X_pca_reduced = PCA(n_components=2).fit_transform(X_sample)\n",
"pca = PCA(n_components=2, random_state=42)\n",
"%time X_pca_reduced = pca.fit_transform(X_sample)\n",
"plot_digits(X_pca_reduced, y_sample)\n",
"plt.show()"
]
@ -1752,7 +1750,7 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
@ -1771,7 +1769,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
@ -1780,7 +1778,7 @@
"\n",
"%time X_pca_lle_reduced = pca_lle.fit_transform(X_sample)\n",
"plot_digits(X_pca_lle_reduced, y_sample)\n",
"plt.show()tight_layout="
"plt.show()"
]
},
{
@ -1801,12 +1799,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:"
"**Warning**: the following cell will take about 10 minutes to run, depending on your hardware:"
]
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
@ -1826,12 +1824,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning**: the following cell will take about 10-15 minutes to run, depending on your hardware:"
"**Warning**: the following cell will take about 10 minutes to run, depending on your hardware:"
]
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
@ -1859,7 +1857,7 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [