Support-Vektor-Maschine zur binären Klassifikation, Version 2.0
Im überarbeiteten Code wurden mehrere strukturelle Verbesserungen und Erweiterungen vorgenommen, um das Machine-Learning-Modell klarer, modularer und datenwissenschaftlich vollständiger aufzubauen. Die wichtigsten Änderungen sind: - Einführung zusätzlicher Explorationsdiagramme: Countplot zur Klassenverteilung, Scatterplot der ersten beiden numerischen Features, Korrelations-Heatmap zur Analyse der Feature-Beziehungen - LabelEncoder korrekt implementiert - Visualisierung der Konfusionsmatrix als Heatmap
This commit is contained in:
parent
e6b8f31c1e
commit
40e97d1b9e
255
svm_modell_2.py
Normal file
255
svm_modell_2.py
Normal file
@ -0,0 +1,255 @@
|
|||||||
|
# Support Vector Machine Version 2.0
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import seaborn as sns
|
||||||
|
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
||||||
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
||||||
|
from sklearn.svm import SVC
|
||||||
|
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
||||||
|
from sklearn.impute import SimpleImputer
|
||||||
|
import warnings
|
||||||
|
warnings.filterwarnings('ignore')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Numeric measurement columns; these are the candidates for model features.
NUMERIC_FEATURES = [
    'Distanz',
    'Kalorien',
    'Ø Herzfrequenz',
    'Maximale Herzfrequenz',
    'Aerober TE',
    'Training Stress Score®',
]

# Columns read from the CSV: the numeric measurements plus the activity type
# column from which the class label is derived.
FEATURES = NUMERIC_FEATURES + ['Aktivitätstyp']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataframe():
    """Read 'Activities_rohdaten.csv', restricted to the columns in FEATURES.

    Returns:
        The loaded DataFrame, or None when the file does not exist (a message
        is printed in that case). Other read errors propagate to the caller.
    """
    try:
        activities = pd.read_csv("Activities_rohdaten.csv", usecols=FEATURES)
    except FileNotFoundError:
        print("Datei 'Activities_rohdaten.csv' nicht gefunden.")
        return None
    return activities
|
||||||
|
|
||||||
|
# --- Sportart-Klassifikation & Labels ---
|
||||||
|
def preprocess_dataframe(df):
    """Label each activity as strength or endurance and drop everything else.

    The 'Aktivitätstyp' text is matched case-insensitively by substring:
    'kraft' gives category 0 (Kraftsport); any of the endurance keywords gives
    category 1 (Ausdauersport); anything else is discarded.

    Args:
        df: DataFrame with an 'Aktivitätstyp' column. It is not modified.

    Returns:
        A new DataFrame holding only the classified rows, with the added
        columns 'sport_category' (0/1) and 'sport_label' (readable name).

    Raises:
        KeyError: if 'Aktivitätstyp' is missing.
    """
    endurance_keywords = ('rad', 'bike', 'cycling', 'velo', 'schwimm', 'laufen', 'run')

    def classify_activity(x):
        x = str(x).lower()
        # 'kraft' is checked first, so it wins if both kinds of keyword occur.
        if 'kraft' in x:
            return 0  # Kraftsport
        if any(keyword in x for keyword in endurance_keywords):
            return 1  # Ausdauersport
        return -1

    # assign() returns a new frame, so the caller's DataFrame keeps its
    # original columns instead of silently gaining 'sport_category'.
    labelled = df.assign(sport_category=df['Aktivitätstyp'].apply(classify_activity))

    # Keep only rows that fell into one of the two classes.
    labelled = labelled[labelled['sport_category'] != -1].copy()

    # Readable labels for the plots and the LabelEncoder.
    labelled['sport_label'] = labelled['sport_category'].map({
        0: 'Kraftsport',
        1: 'Ausdauersport',
    })

    return labelled
|
||||||
|
|
||||||
|
# --- Explorative Plots: Countplot, Scatterplot, Korrelations-Heatmap ---
|
||||||
|
def _plot_numeric_column(series):
    """Return *series* as numbers for plotting; unparseable entries become NaN.

    Text columns have ',' removed first, the same cleaning that
    feature_engineering applies before conversion, so the plots show the
    values the model is trained on instead of dropping them as NaN.
    """
    if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
        series = series.astype(str).str.replace(',', '', regex=False)
    return pd.to_numeric(series, errors='coerce')


def exploratory_plots(df):
    """Draw, save and show the exploratory plots for the labelled data.

    Produces a count plot of 'sport_label' ('class_countplot.png') and, when
    at least two columns of NUMERIC_FEATURES are present, a scatter plot of
    the first two ('scatterplot_features.png') and a correlation heatmap of
    all of them ('features_correlation_heatmap.png').

    Args:
        df: DataFrame with a 'sport_label' column. It is not modified.
    """
    # Class distribution.
    plt.figure(figsize=(6, 4))
    sns.countplot(x='sport_label', data=df)
    plt.title('Verteilung der Sportarten')
    plt.xlabel('Sportart')
    plt.ylabel('Anzahl')
    plt.tight_layout()
    plt.savefig('class_countplot.png', dpi=300, bbox_inches='tight')
    plt.show()

    available_numeric = [f for f in NUMERIC_FEATURES if f in df.columns]
    if len(available_numeric) < 2:
        # Both remaining plots need at least two numeric columns.
        return

    # Convert once on a copy and reuse the result for both plots.
    df_numeric = df.copy()
    for col in available_numeric:
        df_numeric[col] = _plot_numeric_column(df_numeric[col])

    # Scatter plot of the first two numeric features, coloured by class.
    feat_x, feat_y = available_numeric[0], available_numeric[1]
    plt.figure(figsize=(6, 5))
    sns.scatterplot(data=df_numeric, x=feat_x, y=feat_y, hue='sport_label')
    plt.title(f'Scatterplot: {feat_x} vs. {feat_y}')
    plt.tight_layout()
    plt.savefig('scatterplot_features.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Correlation heatmap across all available numeric features.
    corr = df_numeric[available_numeric].corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
    plt.title('Korrelations-Heatmap der numerischen Features')
    plt.tight_layout()
    plt.savefig('features_correlation_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()
|
||||||
|
|
||||||
|
# --- Feature Engineering (inkl. LabelEncoder & Imputer) ---
|
||||||
|
def feature_engineering(df):
    """Build the feature matrix and encoded target from the labelled data.

    Numeric columns are cleaned (',' removed from text values) and converted,
    hour/weekday/month are derived from 'Datum' when that column is present,
    'sport_label' is label-encoded, and missing values are filled with the
    column median.

    Args:
        df: DataFrame with a 'sport_label' column and any subset of
            NUMERIC_FEATURES. It is not modified.

    Returns:
        Tuple (X_imputed, y, feature_names, imputer, label_encoder), where
        feature_names lists the columns of X_imputed in order.
    """
    df_processed = df.copy()

    # Clean and convert the numeric features.
    for feature in NUMERIC_FEATURES:
        if feature not in df_processed.columns:
            continue
        column = df_processed[feature]
        # Also covers pandas' dedicated string dtypes, not only 'object'.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            column = column.astype(str).str.replace(',', '', regex=False)
        df_processed[feature] = pd.to_numeric(column, errors='coerce')

    # Time-based features from the date column.
    # NOTE(review): load_dataframe restricts the CSV to FEATURES, which does
    # not contain 'Datum', so this branch only runs for frames from elsewhere.
    if 'Datum' in df_processed.columns:
        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
        df_processed['hour'] = df_processed['Datum'].dt.hour
        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
        df_processed['month'] = df_processed['Datum'].dt.month
        time_features = ['hour', 'day_of_week', 'month']
    else:
        time_features = []

    candidate_features = [f for f in NUMERIC_FEATURES if f in df_processed.columns] + time_features

    # SimpleImputer discards columns that contain no observed value at all,
    # which would leave feature_names longer than the imputed matrix. Drop
    # such columns here so names and matrix columns stay aligned.
    feature_names = [f for f in candidate_features if df_processed[f].notna().any()]

    X = df_processed[feature_names]

    # Encode sport_label (Kraftsport/Ausdauersport) as integers.
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df_processed['sport_label'])

    # Fill missing values with the column median.
    # NOTE(review): the imputer is fitted on all rows, before the train/test
    # split done later, so test rows influence the medians.
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    return X_imputed, y, feature_names, imputer, label_encoder
|
||||||
|
|
||||||
|
|
||||||
|
# --- SVM-Training mit Hyperparameter-Tuning ---
|
||||||
|
def train_svm_model(X, y):
    """Split the data, scale it, tune an SVC by grid search and predict.

    Uses a stratified 70/30 split, fits the scaler on the training part only,
    and runs a 5-fold grid search over C, gamma and kernel scored by accuracy.

    Args:
        X: feature matrix.
        y: encoded class labels.

    Returns:
        Tuple (best_svm, X_test_scaled, y_test, y_pred, best_params,
        cv_scores, scaler).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Scaling statistics come from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # NOTE(review): gamma has no effect for the linear kernel, so the linear
    # candidates are evaluated once per gamma value.
    search_space = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear'],
    }
    grid_search = GridSearchCV(
        SVC(random_state=42), search_space, cv=5, scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train_scaled, y_train)

    best_svm = grid_search.best_estimator_

    # Cross-validate the selected configuration on the training data.
    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)
    y_pred = best_svm.predict(X_test_scaled)

    return (
        best_svm,
        X_test_scaled,
        y_test,
        y_pred,
        grid_search.best_params_,
        cv_scores,
        scaler,
    )
|
||||||
|
|
||||||
|
# --- Evaluation ---
|
||||||
|
def evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder):
    """Print the evaluation summary and plot the confusion matrix.

    Prints test-set size, best hyperparameters, cross-validation scores,
    test accuracy, the classification report and the confusion matrix, then
    saves the matrix heatmap as 'confusion_matrix.png' and shows it.

    Args:
        y_test: true encoded labels of the test set.
        y_pred: predicted encoded labels for the test set.
        cv_scores: array of cross-validation scores (needs .mean()/.std()).
        best_params: best hyperparameters found by the search.
        label_encoder: fitted LabelEncoder; its classes_ provide the names.

    Returns:
        The accuracy on the test set.
    """
    print("=== SVM-Modell für Sportarten-Klassifikation ===")
    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
    print(f"\nBeste Hyperparameter: {best_params}")

    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    test_acc = accuracy_score(y_test, y_pred)
    print(f"\nTest-Genauigkeit: {test_acc:.4f}")

    class_names = list(label_encoder.classes_)  # e.g. ['Ausdauersport', 'Kraftsport']
    # LabelEncoder encodes classes_ as 0..n-1 in order. Passing the labels
    # explicitly keeps report rows and matrix axes aligned with class_names
    # even when a class does not occur in y_test or y_pred.
    class_ids = list(range(len(class_names)))

    print("\nKlassifikationsbericht:")
    print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

    print("\nKonfusionsmatrix:")
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    print(cm)

    # Heatmap of the confusion matrix.
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
    plt.ylabel('Wahre Klasse')
    plt.xlabel('Vorhergesagte Klasse')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return test_acc
|
||||||
|
|
||||||
|
# --- Feature-Wichtigkeit (nur lineare SVM) ---
|
||||||
|
|
||||||
|
def feature_importance_analysis(model, X_test, feature_names):
    """Print and plot feature importances for a linear-kernel model.

    Importance is the absolute value of the first row of model.coef_. For any
    other kernel (or a model without a 'kernel' attribute) only a message is
    printed.

    Args:
        model: fitted estimator; expected to expose 'kernel' and 'coef_'.
        X_test: unused; kept so the signature stays as callers expect.
        feature_names: names matching the coefficients, in order.
    """
    uses_linear_kernel = getattr(model, 'kernel', None) == 'linear'
    if not uses_linear_kernel:
        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar (aktueller Kernel ist nicht 'linear').")
        return

    weights = np.abs(model.coef_[0])
    ranking = pd.DataFrame({'feature': feature_names, 'importance': weights})
    ranking = ranking.sort_values('importance', ascending=False)

    print("\nFeature-Wichtigkeit (lineare SVM):")
    print(ranking)

    # Bar chart of the ten largest weights.
    plt.figure(figsize=(10, 6))
    sns.barplot(data=ranking.head(10), x='importance', y='feature')
    plt.title('Top 10 Feature-Wichtigkeiten')
    plt.xlabel('Wichtigkeit')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the whole pipeline: load, preprocess, plot, train and evaluate.

    Returns:
        Tuple (model, scaler, imputer, feature_names, label_encoder). All five
        entries are None when the data file is missing or any step raises; the
        error message is printed in that case.
    """
    print("Starte SVM-Modell für Sportarten-Klassifikation...")

    failure = (None, None, None, None, None)

    try:
        df = load_dataframe()
        if df is None:
            return failure

        df = preprocess_dataframe(df)
        print(f"Daten geladen und vorverarbeitet: {df.shape[0]} Datensätze.")
        print("Verteilung der Sportarten:")
        print(df['sport_label'].value_counts())

        # Exploratory plots: count plot, scatter plot, correlation heatmap.
        exploratory_plots(df)

        # Feature engineering.
        X, y, feature_names, imputer, label_encoder = feature_engineering(df)
        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features.")

        # Train the SVM.
        (model, X_test, y_test, y_pred,
         best_params, cv_scores, scaler) = train_svm_model(X, y)

        # Evaluation.
        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder)

        # Feature importances (linear kernel only).
        feature_importance_analysis(model, X_test, feature_names)

        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")
        print("\nGespeicherte Dateien:")
        saved_file_lines = (
            "- class_countplot.png",
            "- scatterplot_features.png",
            "- features_correlation_heatmap.png",
            "- confusion_matrix.png",
            "- feature_importance.png (falls lineare SVM)",
        )
        for line in saved_file_lines:
            print(line)

        return model, scaler, imputer, feature_names, label_encoder

    except Exception as exc:
        # Top-level boundary: report the problem instead of a traceback.
        print(f"Fehler bei der Ausführung: {str(exc)}")
        return failure
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the pipeline and bind its five results to
    # module-level names.
    (model, scaler, imputer, feature_names, label_encoder) = main()
|
||||||
Loading…
x
Reference in New Issue
Block a user