Use as_frame=False for fetch_openml(), and svd_solver=full for PCA, fixes #358
parent
9fede98b42
commit
5663779ae8
|
@ -184,7 +184,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.decomposition import PCA\n",
|
"from sklearn.decomposition import PCA\n",
|
||||||
"\n",
|
"\n",
|
||||||
"pca = PCA(n_components = 2)\n",
|
"pca = PCA(n_components=2)\n",
|
||||||
"X2D = pca.fit_transform(X)"
|
"X2D = pca.fit_transform(X)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -761,6 +761,13 @@
|
||||||
"# MNIST compression"
|
"# MNIST compression"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"**Warning:** since Scikit-Learn 0.24, `fetch_openml()` returns a Pandas `DataFrame` by default. To avoid this and keep the same code as in the book, we set `as_frame=True`."
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 31,
|
"execution_count": 31,
|
||||||
|
@ -769,7 +776,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.datasets import fetch_openml\n",
|
"from sklearn.datasets import fetch_openml\n",
|
||||||
"\n",
|
"\n",
|
||||||
"mnist = fetch_openml('mnist_784', version=1)\n",
|
"mnist = fetch_openml('mnist_784', version=1, as_frame=False)\n",
|
||||||
"mnist.target = mnist.target.astype(np.uint8)"
|
"mnist.target = mnist.target.astype(np.uint8)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -863,7 +870,7 @@
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"pca = PCA(n_components = 154)\n",
|
"pca = PCA(n_components=154)\n",
|
||||||
"X_reduced = pca.fit_transform(X_train)\n",
|
"X_reduced = pca.fit_transform(X_train)\n",
|
||||||
"X_recovered = pca.inverse_transform(X_reduced)"
|
"X_recovered = pca.inverse_transform(X_reduced)"
|
||||||
]
|
]
|
||||||
|
@ -1101,15 +1108,15 @@
|
||||||
"\n",
|
"\n",
|
||||||
"for n_components in (2, 10, 154):\n",
|
"for n_components in (2, 10, 154):\n",
|
||||||
" print(\"n_components =\", n_components)\n",
|
" print(\"n_components =\", n_components)\n",
|
||||||
" regular_pca = PCA(n_components=n_components)\n",
|
" regular_pca = PCA(n_components=n_components, svd_solver=\"full\")\n",
|
||||||
" inc_pca = IncrementalPCA(n_components=n_components, batch_size=500)\n",
|
" inc_pca = IncrementalPCA(n_components=n_components, batch_size=500)\n",
|
||||||
" rnd_pca = PCA(n_components=n_components, random_state=42, svd_solver=\"randomized\")\n",
|
" rnd_pca = PCA(n_components=n_components, random_state=42, svd_solver=\"randomized\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" for pca in (regular_pca, inc_pca, rnd_pca):\n",
|
" for name, pca in ((\"PCA\", regular_pca), (\"Inc PCA\", inc_pca), (\"Rnd PCA\", rnd_pca)):\n",
|
||||||
" t1 = time.time()\n",
|
" t1 = time.time()\n",
|
||||||
" pca.fit(X_train)\n",
|
" pca.fit(X_train)\n",
|
||||||
" t2 = time.time()\n",
|
" t2 = time.time()\n",
|
||||||
" print(\" {}: {:.1f} seconds\".format(pca.__class__.__name__, t2 - t1))"
|
" print(\" {}: {:.1f} seconds\".format(name, t2 - t1))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1130,12 +1137,12 @@
|
||||||
"sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n",
|
"sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n",
|
||||||
"for n_samples in sizes:\n",
|
"for n_samples in sizes:\n",
|
||||||
" X = np.random.randn(n_samples, 5)\n",
|
" X = np.random.randn(n_samples, 5)\n",
|
||||||
" pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n",
|
" pca = PCA(n_components=2, svd_solver=\"randomized\", random_state=42)\n",
|
||||||
" t1 = time.time()\n",
|
" t1 = time.time()\n",
|
||||||
" pca.fit(X)\n",
|
" pca.fit(X)\n",
|
||||||
" t2 = time.time()\n",
|
" t2 = time.time()\n",
|
||||||
" times_rpca.append(t2 - t1)\n",
|
" times_rpca.append(t2 - t1)\n",
|
||||||
" pca = PCA(n_components = 2)\n",
|
" pca = PCA(n_components=2, svd_solver=\"full\")\n",
|
||||||
" t1 = time.time()\n",
|
" t1 = time.time()\n",
|
||||||
" pca.fit(X)\n",
|
" pca.fit(X)\n",
|
||||||
" t2 = time.time()\n",
|
" t2 = time.time()\n",
|
||||||
|
@ -1169,12 +1176,12 @@
|
||||||
"sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n",
|
"sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n",
|
||||||
"for n_features in sizes:\n",
|
"for n_features in sizes:\n",
|
||||||
" X = np.random.randn(2000, n_features)\n",
|
" X = np.random.randn(2000, n_features)\n",
|
||||||
" pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n",
|
" pca = PCA(n_components=2, random_state=42, svd_solver=\"randomized\")\n",
|
||||||
" t1 = time.time()\n",
|
" t1 = time.time()\n",
|
||||||
" pca.fit(X)\n",
|
" pca.fit(X)\n",
|
||||||
" t2 = time.time()\n",
|
" t2 = time.time()\n",
|
||||||
" times_rpca.append(t2 - t1)\n",
|
" times_rpca.append(t2 - t1)\n",
|
||||||
" pca = PCA(n_components = 2)\n",
|
" pca = PCA(n_components=2, svd_solver=\"full\")\n",
|
||||||
" t1 = time.time()\n",
|
" t1 = time.time()\n",
|
||||||
" pca.fit(X)\n",
|
" pca.fit(X)\n",
|
||||||
" t2 = time.time()\n",
|
" t2 = time.time()\n",
|
||||||
|
@ -2252,7 +2259,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.7.8"
|
"version": "3.7.9"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
Loading…
Reference in New Issue