Set n_init explicitly when creating KMeans or MiniBatchKMeans, to avoid warning
parent
9b2c0e81c8
commit
1cf75d217b
|
@ -230,7 +230,7 @@
|
||||||
"mapping = {}\n",
|
"mapping = {}\n",
|
||||||
"for class_id in np.unique(y):\n",
|
"for class_id in np.unique(y):\n",
|
||||||
" mode, _ = stats.mode(y_pred[y==class_id])\n",
|
" mode, _ = stats.mode(y_pred[y==class_id])\n",
|
||||||
" mapping[mode[0]] = class_id\n",
|
" mapping[mode] = class_id\n",
|
||||||
"\n",
|
"\n",
|
||||||
"y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])\n",
|
"y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -309,10 +309,17 @@
|
||||||
" random_state=7)\n",
|
" random_state=7)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"k = 5\n",
|
"k = 5\n",
|
||||||
"kmeans = KMeans(n_clusters=k, random_state=42)\n",
|
"kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)\n",
|
||||||
"y_pred = kmeans.fit_predict(X)"
|
"y_pred = kmeans.fit_predict(X)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note: Throughout this notebook, wherever `n_init` was not specified when creating a `KMeans` estimator, I explicitly set it to `n_init=10`. This avoids a warning that the default value of this hyperparameter will change from 10 to `\"auto\"` in Scikit-Learn 1.4."
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
|
@ -1169,10 +1176,17 @@
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.cluster import MiniBatchKMeans\n",
|
"from sklearn.cluster import MiniBatchKMeans\n",
|
||||||
"\n",
|
"\n",
|
||||||
"minibatch_kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)\n",
|
"minibatch_kmeans = MiniBatchKMeans(n_clusters=5, n_init=3, random_state=42)\n",
|
||||||
"minibatch_kmeans.fit(X)"
|
"minibatch_kmeans.fit(X)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Note: Throughout this notebook, wherever `n_init` was not specified when creating a `MiniBatchKMeans` estimator, I explicitly set it to `n_init=3`. This avoids a warning that the default value of this hyperparameter will change from 3 to `\"auto\"` in Scikit-Learn 1.4."
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 31,
|
"execution_count": 31,
|
||||||
|
@ -1215,7 +1229,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"from sklearn.datasets import fetch_openml\n",
|
"from sklearn.datasets import fetch_openml\n",
|
||||||
"\n",
|
"\n",
|
||||||
"mnist = fetch_openml('mnist_784', as_frame=False)"
|
"mnist = fetch_openml('mnist_784', as_frame=False, parser=\"auto\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -1275,7 +1289,7 @@
|
||||||
"from sklearn.cluster import MiniBatchKMeans\n",
|
"from sklearn.cluster import MiniBatchKMeans\n",
|
||||||
"\n",
|
"\n",
|
||||||
"minibatch_kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10,\n",
|
"minibatch_kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10,\n",
|
||||||
" random_state=42)\n",
|
" n_init=3, random_state=42)\n",
|
||||||
"minibatch_kmeans.fit(X_memmap)"
|
"minibatch_kmeans.fit(X_memmap)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
@ -1320,8 +1334,8 @@
|
||||||
"times = np.empty((max_k, 2))\n",
|
"times = np.empty((max_k, 2))\n",
|
||||||
"inertias = np.empty((max_k, 2))\n",
|
"inertias = np.empty((max_k, 2))\n",
|
||||||
"for k in range(1, max_k + 1):\n",
|
"for k in range(1, max_k + 1):\n",
|
||||||
" kmeans_ = KMeans(n_clusters=k, algorithm=\"full\", random_state=42)\n",
|
" kmeans_ = KMeans(n_clusters=k, algorithm=\"lloyd\", n_init=10, random_state=42)\n",
|
||||||
" minibatch_kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)\n",
|
" minibatch_kmeans = MiniBatchKMeans(n_clusters=k, n_init=10, random_state=42)\n",
|
||||||
" print(f\"\\r{k}/{max_k}\", end=\"\") # \\r returns to the start of line\n",
|
" print(f\"\\r{k}/{max_k}\", end=\"\") # \\r returns to the start of line\n",
|
||||||
" times[k - 1, 0] = timeit(\"kmeans_.fit(X)\", number=10, globals=globals())\n",
|
" times[k - 1, 0] = timeit(\"kmeans_.fit(X)\", number=10, globals=globals())\n",
|
||||||
" times[k - 1, 1] = timeit(\"minibatch_kmeans.fit(X)\", number=10,\n",
|
" times[k - 1, 1] = timeit(\"minibatch_kmeans.fit(X)\", number=10,\n",
|
||||||
|
@ -1387,8 +1401,8 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# extra code – this cell generates and saves Figure 9–7\n",
|
"# extra code – this cell generates and saves Figure 9–7\n",
|
||||||
"\n",
|
"\n",
|
||||||
"kmeans_k3 = KMeans(n_clusters=3, random_state=42)\n",
|
"kmeans_k3 = KMeans(n_clusters=3, n_init=10, random_state=42)\n",
|
||||||
"kmeans_k8 = KMeans(n_clusters=8, random_state=42)\n",
|
"kmeans_k8 = KMeans(n_clusters=8, n_init=10, random_state=42)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"plot_clusterer_comparison(kmeans_k3, kmeans_k8, X, \"$k=3$\", \"$k=8$\")\n",
|
"plot_clusterer_comparison(kmeans_k3, kmeans_k8, X, \"$k=3$\", \"$k=8$\")\n",
|
||||||
"save_fig(\"bad_n_clusters_plot\")\n",
|
"save_fig(\"bad_n_clusters_plot\")\n",
|
||||||
|
@ -1470,7 +1484,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"# extra code – this cell generates and saves Figure 9–8\n",
|
"# extra code – this cell generates and saves Figure 9–8\n",
|
||||||
"\n",
|
"\n",
|
||||||
"kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)\n",
|
"kmeans_per_k = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)\n",
|
||||||
" for k in range(1, 10)]\n",
|
" for k in range(1, 10)]\n",
|
||||||
"inertias = [model.inertia_ for model in kmeans_per_k]\n",
|
"inertias = [model.inertia_ for model in kmeans_per_k]\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -1724,7 +1738,7 @@
|
||||||
"kmeans_good = KMeans(n_clusters=3,\n",
|
"kmeans_good = KMeans(n_clusters=3,\n",
|
||||||
" init=np.array([[-1.5, 2.5], [0.5, 0], [4, 0]]),\n",
|
" init=np.array([[-1.5, 2.5], [0.5, 0], [4, 0]]),\n",
|
||||||
" n_init=1, random_state=42)\n",
|
" n_init=1, random_state=42)\n",
|
||||||
"kmeans_bad = KMeans(n_clusters=3, random_state=42)\n",
|
"kmeans_bad = KMeans(n_clusters=3, n_init=10, random_state=42)\n",
|
||||||
"kmeans_good.fit(X)\n",
|
"kmeans_good.fit(X)\n",
|
||||||
"kmeans_bad.fit(X)\n",
|
"kmeans_bad.fit(X)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -1805,7 +1819,7 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"X = image.reshape(-1, 3)\n",
|
"X = image.reshape(-1, 3)\n",
|
||||||
"kmeans = KMeans(n_clusters=8, random_state=42).fit(X)\n",
|
"kmeans = KMeans(n_clusters=8, n_init=10, random_state=42).fit(X)\n",
|
||||||
"segmented_img = kmeans.cluster_centers_[kmeans.labels_]\n",
|
"segmented_img = kmeans.cluster_centers_[kmeans.labels_]\n",
|
||||||
"segmented_img = segmented_img.reshape(image.shape)"
|
"segmented_img = segmented_img.reshape(image.shape)"
|
||||||
]
|
]
|
||||||
|
@ -1834,7 +1848,7 @@
|
||||||
"segmented_imgs = []\n",
|
"segmented_imgs = []\n",
|
||||||
"n_colors = (10, 8, 6, 4, 2)\n",
|
"n_colors = (10, 8, 6, 4, 2)\n",
|
||||||
"for n_clusters in n_colors:\n",
|
"for n_clusters in n_colors:\n",
|
||||||
" kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)\n",
|
" kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X)\n",
|
||||||
" segmented_img = kmeans.cluster_centers_[kmeans.labels_]\n",
|
" segmented_img = kmeans.cluster_centers_[kmeans.labels_]\n",
|
||||||
" segmented_imgs.append(segmented_img.reshape(image.shape))\n",
|
" segmented_imgs.append(segmented_img.reshape(image.shape))\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
@ -1978,7 +1992,7 @@
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"k = 50\n",
|
"k = 50\n",
|
||||||
"kmeans = KMeans(n_clusters=k, random_state=42)\n",
|
"kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)\n",
|
||||||
"X_digits_dist = kmeans.fit_transform(X_train)\n",
|
"X_digits_dist = kmeans.fit_transform(X_train)\n",
|
||||||
"representative_digit_idx = X_digits_dist.argmin(axis=0)\n",
|
"representative_digit_idx = X_digits_dist.argmin(axis=0)\n",
|
||||||
"X_representative_digits = X_train[representative_digit_idx]"
|
"X_representative_digits = X_train[representative_digit_idx]"
|
||||||
|
@ -2623,8 +2637,7 @@
|
||||||
"source": [
|
"source": [
|
||||||
"def plot_spectral_clustering(sc, X, size, alpha, show_xlabels=True,\n",
|
"def plot_spectral_clustering(sc, X, size, alpha, show_xlabels=True,\n",
|
||||||
" show_ylabels=True):\n",
|
" show_ylabels=True):\n",
|
||||||
" plt.scatter(X[:, 0], X[:, 1], marker='o', s=size, c='gray', cmap=\"Paired\",\n",
|
" plt.scatter(X[:, 0], X[:, 1], marker='o', s=size, c='gray', alpha=alpha)\n",
|
||||||
" alpha=alpha)\n",
|
|
||||||
" plt.scatter(X[:, 0], X[:, 1], marker='o', s=30, c='w')\n",
|
" plt.scatter(X[:, 0], X[:, 1], marker='o', s=30, c='w')\n",
|
||||||
" plt.scatter(X[:, 0], X[:, 1], marker='.', s=10, c=sc.labels_, cmap=\"Paired\")\n",
|
" plt.scatter(X[:, 0], X[:, 1], marker='.', s=10, c=sc.labels_, cmap=\"Paired\")\n",
|
||||||
" \n",
|
" \n",
|
||||||
|
@ -4005,7 +4018,7 @@
|
||||||
"kmeans_per_k = []\n",
|
"kmeans_per_k = []\n",
|
||||||
"for k in k_range:\n",
|
"for k in k_range:\n",
|
||||||
" print(f\"k={k}\")\n",
|
" print(f\"k={k}\")\n",
|
||||||
" kmeans = KMeans(n_clusters=k, random_state=42)\n",
|
" kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)\n",
|
||||||
" kmeans.fit(X_train_pca)\n",
|
" kmeans.fit(X_train_pca)\n",
|
||||||
" kmeans_per_k.append(kmeans)"
|
" kmeans_per_k.append(kmeans)"
|
||||||
]
|
]
|
||||||
|
@ -6581,7 +6594,7 @@
|
||||||
"\n",
|
"\n",
|
||||||
"for n_clusters in k_range:\n",
|
"for n_clusters in k_range:\n",
|
||||||
" pipeline = make_pipeline(\n",
|
" pipeline = make_pipeline(\n",
|
||||||
" KMeans(n_clusters=n_clusters, random_state=42),\n",
|
" KMeans(n_clusters=n_clusters, n_init=10, random_state=42),\n",
|
||||||
" RandomForestClassifier(n_estimators=150, random_state=42)\n",
|
" RandomForestClassifier(n_estimators=150, random_state=42)\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" pipeline.fit(X_train_pca, y_train)\n",
|
" pipeline.fit(X_train_pca, y_train)\n",
|
||||||
|
@ -6971,7 +6984,7 @@
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "Python 3",
|
"display_name": "Python 3 (ipykernel)",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "python3"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
|
@ -6985,7 +6998,7 @@
|
||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.10.6"
|
"version": "3.10.13"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|
Loading…
Reference in New Issue