From 5663779ae83023cf7ae4992266ebc8b9c9160327 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Tue, 2 Mar 2021 09:19:21 +1300 Subject: [PATCH] Use as_frame=False for fetch_open_ml(), and svd_solver=full for PCA, fixes #358 --- 08_dimensionality_reduction.ipynb | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index c7f1797..272c8ea 100644 --- a/08_dimensionality_reduction.ipynb +++ b/08_dimensionality_reduction.ipynb @@ -184,7 +184,7 @@ "source": [ "from sklearn.decomposition import PCA\n", "\n", - "pca = PCA(n_components = 2)\n", + "pca = PCA(n_components=2)\n", "X2D = pca.fit_transform(X)" ] }, @@ -761,6 +761,13 @@ "# MNIST compression" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning:** since Scikit-Learn 0.24, `fetch_openml()` returns a Pandas `DataFrame` by default. To avoid this and keep the same code as in the book, we set `as_frame=False`." 
+ ] + }, { "cell_type": "code", "execution_count": 31, @@ -769,7 +776,7 @@ "source": [ "from sklearn.datasets import fetch_openml\n", "\n", - "mnist = fetch_openml('mnist_784', version=1)\n", + "mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n", "mnist.target = mnist.target.astype(np.uint8)" ] }, @@ -863,7 +870,7 @@ "metadata": {}, "outputs": [], "source": [ - "pca = PCA(n_components = 154)\n", + "pca = PCA(n_components=154)\n", "X_reduced = pca.fit_transform(X_train)\n", "X_recovered = pca.inverse_transform(X_reduced)" ] @@ -1101,15 +1108,15 @@ "\n", "for n_components in (2, 10, 154):\n", " print(\"n_components =\", n_components)\n", - " regular_pca = PCA(n_components=n_components)\n", + " regular_pca = PCA(n_components=n_components, svd_solver=\"full\")\n", " inc_pca = IncrementalPCA(n_components=n_components, batch_size=500)\n", " rnd_pca = PCA(n_components=n_components, random_state=42, svd_solver=\"randomized\")\n", "\n", - " for pca in (regular_pca, inc_pca, rnd_pca):\n", + " for name, pca in ((\"PCA\", regular_pca), (\"Inc PCA\", inc_pca), (\"Rnd PCA\", rnd_pca)):\n", " t1 = time.time()\n", " pca.fit(X_train)\n", " t2 = time.time()\n", - " print(\" {}: {:.1f} seconds\".format(pca.__class__.__name__, t2 - t1))" + " print(\" {}: {:.1f} seconds\".format(name, t2 - t1))" ] }, { @@ -1130,12 +1137,12 @@ "sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n", "for n_samples in sizes:\n", " X = np.random.randn(n_samples, 5)\n", - " pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n", + " pca = PCA(n_components=2, svd_solver=\"randomized\", random_state=42)\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", " times_rpca.append(t2 - t1)\n", - " pca = PCA(n_components = 2)\n", + " pca = PCA(n_components=2, svd_solver=\"full\")\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", @@ -1169,12 +1176,12 @@ "sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n", "for n_features in 
sizes:\n", " X = np.random.randn(2000, n_features)\n", - " pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n", + " pca = PCA(n_components=2, random_state=42, svd_solver=\"randomized\")\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", " times_rpca.append(t2 - t1)\n", - " pca = PCA(n_components = 2)\n", + " pca = PCA(n_components=2, svd_solver=\"full\")\n", " t1 = time.time()\n", " pca.fit(X)\n", " t2 = time.time()\n", @@ -2252,7 +2259,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.8" + "version": "3.7.9" } }, "nbformat": 4,