cds1011/clustering/penguins_clustering.py
2026-01-20 09:56:35 +01:00

42 lines
1.1 KiB
Python

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
# Columns of the penguin data set that are used as clustering features.
features = ["Flipper Length (mm)", "Culmen Length (mm)"]

# Read only the feature columns; the CSV is resolved relative to the
# current working directory.
penguins = pd.read_csv("penguins.csv", usecols=features)

# Missing values: replace every NaN with the mean of its column so that
# no row has to be dropped before clustering.
# (Inspect with `penguins.isnull().sum()` before/after if needed.)
penguins = penguins.fillna(penguins.mean(numeric_only=True))

# Standardise the features (zero mean, unit variance) so that both
# measurements contribute equally to the Euclidean distances k-means uses.
X = penguins
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
# Elbow method: fit k-means for every candidate number of clusters and
# record the inertia (within-cluster sum of squared distances) of each fit.
inertias = []
k_values = range(2, 7)
for k in k_values:
    # A fixed seed and an explicit n_init make the curve reproducible from
    # run to run and independent of the scikit-learn version's n_init default.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(X_scaled)
    inertias.append(model.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests a suitable k.
df_inertias = pd.DataFrame(dict(x=k_values, y=inertias))
df_inertias.plot(
    x="x",
    y="y",
    xticks=k_values,
    grid=True,
    figsize=(8, 6),
    xlabel="k",
    ylabel="inertia",
)
plt.show()
# Final model with k=3 clusters. The fixed seed and explicit n_init make the
# cluster labels, and therefore the silhouette score, reproducible.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Silhouette score lies in [-1, 1]; higher values mean better separated,
# more compact clusters.
silhouette_kmeans = silhouette_score(X_scaled, clusters)
print("KMEans Silhouette", silhouette_kmeans)