# Support Vector Machine Version 2.0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
import warnings

warnings.filterwarnings('ignore')

# Column names as they appear in the source CSV (German header).
FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz',
            'Aerober TE', 'Training Stress Score®', 'Aktivitätstyp']
NUMERIC_FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz',
                    'Aerober TE', 'Training Stress Score®']


def load_dataframe():
    try:
        df = pd.read_csv("Activities_rohdaten.csv", usecols=FEATURES)
        return df
    except FileNotFoundError:
        print("File 'Activities_rohdaten.csv' not found.")
        return None


# --- Sport type classification & labels ---
def preprocess_dataframe(df):
    def classify_activity(x):
        x = str(x).lower()
        if 'kraft' in x:
            return 0  # Kraftsport (strength training)
        if ('rad' in x or 'bike' in x or 'cycling' in x or 'velo' in x
                or 'schwimm' in x or 'laufen' in x or 'run' in x):
            return 1  # Ausdauersport (endurance sport)
        return -1

    df['sport_category'] = df['Aktivitätstyp'].apply(classify_activity)
    # Keep only the valid classes
    df = df[df['sport_category'] != -1].copy()
    # Readable labels for the plots and the LabelEncoder
    df['sport_label'] = df['sport_category'].map({
        0: 'Kraftsport',
        1: 'Ausdauersport'
    })
    return df


# --- Exploratory plots: count plot, scatter plot, correlation heatmap ---
def exploratory_plots(df):
    plt.figure(figsize=(6, 4))
    sns.countplot(x='sport_label', data=df)
    plt.title('Distribution of sport types')
    plt.xlabel('Sport type')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.savefig('class_countplot.png', dpi=300, bbox_inches='tight')
    plt.show()

    available_numeric = [f for f in NUMERIC_FEATURES if f in df.columns]

    if len(available_numeric) >= 2:
        feat_x = available_numeric[0]
        feat_y = available_numeric[1]
        # Convert to numeric values for plotting
        df_plot = df.copy()
        for col in [feat_x, feat_y]:
            df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce')
        plt.figure(figsize=(6, 5))
        sns.scatterplot(data=df_plot, x=feat_x, y=feat_y, hue='sport_label')
        plt.title(f'Scatter plot: {feat_x} vs. {feat_y}')
        plt.tight_layout()
        plt.savefig('scatterplot_features.png', dpi=300, bbox_inches='tight')
        plt.show()

    if len(available_numeric) > 1:
        df_corr = df.copy()
        for col in available_numeric:
            df_corr[col] = pd.to_numeric(df_corr[col], errors='coerce')
        corr = df_corr[available_numeric].corr()
        plt.figure(figsize=(8, 6))
        sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
        plt.title('Correlation heatmap of the numeric features')
        plt.tight_layout()
        plt.savefig('features_correlation_heatmap.png', dpi=300, bbox_inches='tight')
        plt.show()


# --- Feature engineering (incl. LabelEncoder & imputer) ---
def feature_engineering(df):
    df_processed = df.copy()

    # Clean and convert the numeric features
    for feature in NUMERIC_FEATURES:
        if feature in df_processed.columns:
            if df_processed[feature].dtype == 'object':
                # Remove commas before the numeric conversion
                df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '')
            df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')

    # Time-based features derived from the date column (if present)
    if 'Datum' in df_processed.columns:
        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
        df_processed['hour'] = df_processed['Datum'].dt.hour
        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
        df_processed['month'] = df_processed['Datum'].dt.month
        time_features = ['hour', 'day_of_week', 'month']
    else:
        time_features = []

    # All features that go into the model
    feature_names = [f for f in NUMERIC_FEATURES if f in df_processed.columns] + time_features
    X = df_processed[feature_names]

    # LabelEncoder for sport_label (Kraftsport/Ausdauersport)
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(df_processed['sport_label'])

    # Imputer for missing values
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    return X_imputed, y, feature_names, imputer, label_encoder


# --- SVM training with hyperparameter tuning ---
def train_svm_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    param_grid = {'C': [0.1, 1, 10, 100],
                  'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                  'kernel': ['rbf', 'linear']}
    svm = SVC(random_state=42)
    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)

    best_svm = grid_search.best_estimator_
    y_pred = best_svm.predict(X_test_scaled)
    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)

    return best_svm, X_test_scaled, y_test, y_pred, grid_search.best_params_, cv_scores, scaler


# --- Evaluation ---
def evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder):
    print("=== SVM model for sport type classification ===")
    print(f"Number of test samples: {len(y_test)}")
    print(f"\nBest hyperparameters: {best_params}")
    print(f"\nCross-validation (CV scores): {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    test_acc = accuracy_score(y_test, y_pred)
    print(f"\nTest accuracy: {test_acc:.4f}")

    class_names = list(label_encoder.classes_)  # e.g. ['Ausdauersport', 'Kraftsport']
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, target_names=class_names))

    print("\nConfusion matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # Heatmap of the confusion matrix
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Confusion matrix - SVM sport type classification')
    plt.ylabel('True class')
    plt.xlabel('Predicted class')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return test_acc


# --- Feature importance (linear SVM only) ---
def feature_importance_analysis(model, X_test, feature_names):
    if hasattr(model, 'kernel') and model.kernel == 'linear':
        importance = np.abs(model.coef_[0])
        feature_importance = (pd.DataFrame({'feature': feature_names,
                                            'importance': importance})
                              .sort_values('importance', ascending=False))
        print("\nFeature importance (linear SVM):")
        print(feature_importance)

        plt.figure(figsize=(10, 6))
        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
        plt.title('Top 10 feature importances')
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
        plt.show()
    else:
        print("\nFeature importance is only available for a linear SVM "
              "(the current kernel is not 'linear').")


def main():
    print("Starting the SVM model for sport type classification...")
    try:
        df = load_dataframe()
        if df is None:
            return None, None, None, None, None

        df = preprocess_dataframe(df)
        print(f"Data loaded and preprocessed: {df.shape[0]} records.")
        print("Distribution of sport types:")
        print(df['sport_label'].value_counts())

        # Exploratory plots: count plot, scatter plot, correlation heatmap
        exploratory_plots(df)

        # Feature engineering
        X, y, feature_names, imputer, label_encoder = feature_engineering(df)
        print(f"Feature engineering finished: {X.shape[1]} features.")

        # Train the SVM model
        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)

        # Evaluation
        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder)

        # Feature importances (if the best kernel is linear)
        feature_importance_analysis(model, X_test, feature_names)

        print(f"\nModel trained successfully with accuracy: {accuracy:.4f}")
        print("\nSaved files:")
        print("- class_countplot.png")
        print("- scatterplot_features.png")
        print("- features_correlation_heatmap.png")
        print("- confusion_matrix.png")
        print("- feature_importance.png (only for a linear SVM)")

        return model, scaler, imputer, feature_names, label_encoder

    except Exception as e:
        print(f"Error during execution: {str(e)}")
        return None, None, None, None, None


if __name__ == "__main__":
    model, scaler, imputer, feature_names, label_encoder = main()
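

# --- Example usage (illustrative sketch, not part of the original pipeline) ---
# A hedged sketch of how the objects returned by main() (model, scaler, imputer,
# feature_names, label_encoder) might be applied to one new activity record.
# The helper name and the feature values below are hypothetical and only
# illustrate the expected input shape; the dict keys must match feature_names.
def predict_single_activity(model, scaler, imputer, feature_names, label_encoder, activity):
    """Classify one activity given as a dict of raw feature values (sketch)."""
    # Build a one-row frame in the exact column order used during training;
    # columns missing from the dict become NaN and are filled by the imputer.
    row = pd.DataFrame([activity]).reindex(columns=feature_names)
    # Apply the same imputation and scaling that were fitted on the training data.
    row_scaled = scaler.transform(imputer.transform(row))
    pred = model.predict(row_scaled)
    # Map the numeric prediction back to the readable class label.
    return label_encoder.inverse_transform(pred)[0]


# Hypothetical call with invented values (uncomment to try after a successful run):
# if model is not None:
#     print(predict_single_activity(
#         model, scaler, imputer, feature_names, label_encoder,
#         {'Distanz': 10.0, 'Kalorien': 650, 'Ø Herzfrequenz': 145,
#          'Maximale Herzfrequenz': 172, 'Aerober TE': 3.1,
#          'Training Stress Score®': 85.0}))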