diff --git a/09_unsupervised_learning.ipynb b/09_unsupervised_learning.ipynb
index fc1ea89..ebbf3ac 100644
--- a/09_unsupervised_learning.ipynb
+++ b/09_unsupervised_learning.ipynb
@@ -230,7 +230,7 @@
     "mapping = {}\n",
     "for class_id in np.unique(y):\n",
     "    mode, _ = stats.mode(y_pred[y==class_id])\n",
-    "    mapping[mode[0]] = class_id\n",
+    "    mapping[mode] = class_id\n",
     "\n",
     "y_pred = np.array([mapping[cluster_id] for cluster_id in y_pred])\n",
     "\n",
@@ -309,10 +309,17 @@
     "                  random_state=7)\n",
     "\n",
     "k = 5\n",
-    "kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)\n",
     "y_pred = kmeans.fit_predict(X)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: Throughout this notebook, when `n_init` was not set when creating a `KMeans` estimator, I explicitly set it to `n_init=10` to avoid a warning about the fact that the default value for this hyperparameter will change from 10 to `\"auto\"` in Scikit-Learn 1.4."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1169,10 +1176,17 @@
    "source": [
     "from sklearn.cluster import MiniBatchKMeans\n",
     "\n",
-    "minibatch_kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)\n",
+    "minibatch_kmeans = MiniBatchKMeans(n_clusters=5, n_init=3, random_state=42)\n",
     "minibatch_kmeans.fit(X)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: Throughout this notebook, when `n_init` was not set when creating a `MiniBatchKMeans` estimator, I explicitly set it to `n_init=3` to avoid a warning about the fact that the default value for this hyperparameter will change from 3 to `\"auto\"` in Scikit-Learn 1.4."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 31,
@@ -1215,7 +1229,7 @@
    "source": [
     "from sklearn.datasets import fetch_openml\n",
     "\n",
-    "mnist = fetch_openml('mnist_784', as_frame=False)"
+    "mnist = fetch_openml('mnist_784', as_frame=False, parser=\"auto\")"
    ]
   },
   {
@@ -1275,7 +1289,7 @@
     "from sklearn.cluster import MiniBatchKMeans\n",
     "\n",
     "minibatch_kmeans = MiniBatchKMeans(n_clusters=10, batch_size=10,\n",
-    "                                   random_state=42)\n",
+    "                                   n_init=3, random_state=42)\n",
     "minibatch_kmeans.fit(X_memmap)"
    ]
   },
@@ -1320,8 +1334,8 @@
     "times = np.empty((max_k, 2))\n",
     "inertias = np.empty((max_k, 2))\n",
     "for k in range(1, max_k + 1):\n",
-    "    kmeans_ = KMeans(n_clusters=k, algorithm=\"full\", random_state=42)\n",
-    "    minibatch_kmeans = MiniBatchKMeans(n_clusters=k, random_state=42)\n",
+    "    kmeans_ = KMeans(n_clusters=k, algorithm=\"lloyd\", n_init=10, random_state=42)\n",
+    "    minibatch_kmeans = MiniBatchKMeans(n_clusters=k, n_init=10, random_state=42)\n",
     "    print(f\"\\r{k}/{max_k}\", end=\"\")  # \\r returns to the start of line\n",
     "    times[k - 1, 0] = timeit(\"kmeans_.fit(X)\", number=10, globals=globals())\n",
     "    times[k - 1, 1] = timeit(\"minibatch_kmeans.fit(X)\", number=10,\n",
@@ -1387,8 +1401,8 @@
    "source": [
     "# extra code – this cell generates and saves Figure 9–7\n",
     "\n",
-    "kmeans_k3 = KMeans(n_clusters=3, random_state=42)\n",
-    "kmeans_k8 = KMeans(n_clusters=8, random_state=42)\n",
+    "kmeans_k3 = KMeans(n_clusters=3, n_init=10, random_state=42)\n",
+    "kmeans_k8 = KMeans(n_clusters=8, n_init=10, random_state=42)\n",
     "\n",
     "plot_clusterer_comparison(kmeans_k3, kmeans_k8, X, \"$k=3$\", \"$k=8$\")\n",
     "save_fig(\"bad_n_clusters_plot\")\n",
@@ -1470,7 +1484,7 @@
    "source": [
     "# extra code – this cell generates and saves Figure 9–8\n",
     "\n",
-    "kmeans_per_k = [KMeans(n_clusters=k, random_state=42).fit(X)\n",
+    "kmeans_per_k = [KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)\n",
     "                for k in range(1, 10)]\n",
     "inertias = [model.inertia_ for model in kmeans_per_k]\n",
     "\n",
@@ -1724,7 +1738,7 @@
     "kmeans_good = KMeans(n_clusters=3,\n",
     "                     init=np.array([[-1.5, 2.5], [0.5, 0], [4, 0]]),\n",
     "                     n_init=1, random_state=42)\n",
-    "kmeans_bad = KMeans(n_clusters=3, random_state=42)\n",
+    "kmeans_bad = KMeans(n_clusters=3, n_init=10, random_state=42)\n",
     "kmeans_good.fit(X)\n",
     "kmeans_bad.fit(X)\n",
     "\n",
@@ -1805,7 +1819,7 @@
    "outputs": [],
    "source": [
     "X = image.reshape(-1, 3)\n",
-    "kmeans = KMeans(n_clusters=8, random_state=42).fit(X)\n",
+    "kmeans = KMeans(n_clusters=8, n_init=10, random_state=42).fit(X)\n",
     "segmented_img = kmeans.cluster_centers_[kmeans.labels_]\n",
     "segmented_img = segmented_img.reshape(image.shape)"
    ]
@@ -1834,7 +1848,7 @@
     "segmented_imgs = []\n",
     "n_colors = (10, 8, 6, 4, 2)\n",
     "for n_clusters in n_colors:\n",
-    "    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X)\n",
+    "    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42).fit(X)\n",
     "    segmented_img = kmeans.cluster_centers_[kmeans.labels_]\n",
     "    segmented_imgs.append(segmented_img.reshape(image.shape))\n",
     "\n",
@@ -1978,7 +1992,7 @@
    "outputs": [],
    "source": [
     "k = 50\n",
-    "kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)\n",
     "X_digits_dist = kmeans.fit_transform(X_train)\n",
     "representative_digit_idx = X_digits_dist.argmin(axis=0)\n",
     "X_representative_digits = X_train[representative_digit_idx]"
@@ -2623,8 +2637,7 @@
    "source": [
     "def plot_spectral_clustering(sc, X, size, alpha, show_xlabels=True,\n",
     "                             show_ylabels=True):\n",
-    "    plt.scatter(X[:, 0], X[:, 1], marker='o', s=size, c='gray', cmap=\"Paired\",\n",
-    "                alpha=alpha)\n",
+    "    plt.scatter(X[:, 0], X[:, 1], marker='o', s=size, c='gray', alpha=alpha)\n",
     "    plt.scatter(X[:, 0], X[:, 1], marker='o', s=30, c='w')\n",
     "    plt.scatter(X[:, 0], X[:, 1], marker='.', s=10, c=sc.labels_, cmap=\"Paired\")\n",
     "    \n",
@@ -4005,7 +4018,7 @@
     "kmeans_per_k = []\n",
     "for k in k_range:\n",
     "    print(f\"k={k}\")\n",
-    "    kmeans = KMeans(n_clusters=k, random_state=42)\n",
+    "    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)\n",
     "    kmeans.fit(X_train_pca)\n",
     "    kmeans_per_k.append(kmeans)"
    ]
@@ -6581,7 +6594,7 @@
     "\n",
     "for n_clusters in k_range:\n",
     "    pipeline = make_pipeline(\n",
-    "        KMeans(n_clusters=n_clusters, random_state=42),\n",
+    "        KMeans(n_clusters=n_clusters, n_init=10, random_state=42),\n",
     "        RandomForestClassifier(n_estimators=150, random_state=42)\n",
     "    )\n",
     "    pipeline.fit(X_train_pca, y_train)\n",
@@ -6971,7 +6984,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -6985,7 +6998,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,