42 lines
1.1 KiB
Python
42 lines
1.1 KiB
Python
import pandas as pd
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.metrics import silhouette_score
|
|
from sklearn.cluster import KMeans
|
|
|
|
# Seed shared by every KMeans fit so that the elbow curve and the silhouette
# score are reproducible between runs.
RANDOM_STATE = 42
# Number of centroid initialisations per fit; set explicitly so the result
# does not depend on the scikit-learn version's default.
N_INIT = 10

# The two measurement columns used as clustering features.
features = ["Flipper Length (mm)", "Culmen Length (mm)"]
penguins = pd.read_csv("penguins.csv", usecols=features)

# Exploratory inspection, disabled by wrapping it in a string literal.
'''print(penguins.head())
print(penguins.info())
print(penguins.describe())

sns.pairplot(penguins.dropna())
plt.show()'''

# Are there missing values?
#print(penguins.isnull().sum())
# Replace missing entries with the mean of their column.
penguins.fillna(penguins.mean(numeric_only=True), inplace=True)
#print(penguins.isnull().sum())

# Standardise the features (StandardScaler: zero mean, unit variance) so both
# columns contribute comparably to the Euclidean distances used by k-means.
X = penguins
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Elbow method: record the inertia of a fitted model for each candidate k.
inertias = []
k_values = range(2, 7)

for k in k_values:
    model = KMeans(n_clusters=k, n_init=N_INIT, random_state=RANDOM_STATE)
    model.fit(X_scaled)
    inertias.append(model.inertia_)

df_inertias = pd.DataFrame(dict(x=k_values, y=inertias))
df_inertias.plot(
    x="x",
    y="y",
    xticks=k_values,
    grid=True,
    figsize=(8, 6),
    xlabel="k",
    ylabel="inertia",
)
plt.show()

# Final model with k=3 (presumably read off the elbow plot above -- confirm),
# evaluated with the silhouette score on the scaled data.
kmeans = KMeans(n_clusters=3, n_init=N_INIT, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(X_scaled)
silhouette_kmeans = silhouette_score(X_scaled, clusters)
print("KMEans Silhouette", silhouette_kmeans)