From 40e97d1b9ee3639038e4a23f2f31eb610dd67fec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adu=20Alex=20G=C3=B6llnitz?=
Date: Mon, 24 Nov 2025 13:56:52 +0100
Subject: [PATCH] Support vector machine for binary classification, version 2.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The revised code introduces several structural improvements and extensions that make the machine-learning model clearer, more modular, and more complete from a data-science perspective. The main changes are:

Additional exploratory plots:
- count plot of the class distribution
- scatter plot of the first two numeric features
- correlation heatmap for analysing the relationships between features

LabelEncoder implemented correctly

Confusion matrix visualised as a heatmap
---
 svm_modell_2.py | 255 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 svm_modell_2.py

diff --git a/svm_modell_2.py b/svm_modell_2.py
new file mode 100644
index 0000000..730cbdb
--- /dev/null
+++ b/svm_modell_2.py
@@ -0,0 +1,255 @@
+# Support Vector Machine Version 2.0
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.svm import SVC
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+from sklearn.impute import SimpleImputer
+import warnings
+warnings.filterwarnings('ignore')
+
+
+
+FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®', 'Aktivitätstyp']
+
+NUMERIC_FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®']
+
+
+def load_dataframe():
+    try:
+        df = pd.read_csv("Activities_rohdaten.csv", usecols=FEATURES)
+        return df
+    except FileNotFoundError:
+        print("Datei 'Activities_rohdaten.csv' nicht gefunden.")
+        return None
+
+# --- Sport-type classification & labels ---
+def preprocess_dataframe(df):
+
+    def classify_activity(x):
+        x = str(x).lower()
+        if 'kraft' in x:
+            return 0  # strength training
+        if ('rad' in x or 'bike' in x or 'cycling' in x or 'velo' in x or 'schwimm' in x or 'laufen' in x or 'run' in x):
+            return 1  # endurance sport
+        return -1
+
+    df['sport_category'] = df['Aktivitätstyp'].apply(classify_activity)
+
+    # Keep only valid classes
+    df = df[df['sport_category'] != -1].copy()
+
+    # Human-readable labels for plots and the LabelEncoder
+    df['sport_label'] = df['sport_category'].map({
+        0: 'Kraftsport',
+        1: 'Ausdauersport'
+    })
+
+    return df
+
+# --- Exploratory plots: count plot, scatter plot, correlation heatmap ---
+def exploratory_plots(df):
+    plt.figure(figsize=(6, 4))
+    sns.countplot(x='sport_label', data=df)
+    plt.title('Verteilung der Sportarten')
+    plt.xlabel('Sportart')
+    plt.ylabel('Anzahl')
+    plt.tight_layout()
+    plt.savefig('class_countplot.png', dpi=300, bbox_inches='tight')
+    plt.show()
+
+    available_numeric = [f for f in NUMERIC_FEATURES if f in df.columns]
+    if len(available_numeric) >= 2:
+        feat_x = available_numeric[0]
+        feat_y = available_numeric[1]
+
+        # Convert to numeric for plotting
+        df_plot = df.copy()
+        for col in [feat_x, feat_y]:
+            df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce')
+
+        plt.figure(figsize=(6, 5))
+        sns.scatterplot(data=df_plot, x=feat_x, y=feat_y, hue='sport_label')
+        plt.title(f'Scatterplot: {feat_x} vs. {feat_y}')
+        plt.tight_layout()
+        plt.savefig('scatterplot_features.png', dpi=300, bbox_inches='tight')
+        plt.show()
+
+    if len(available_numeric) > 1:
+        df_corr = df.copy()
+        for col in available_numeric:
+            df_corr[col] = pd.to_numeric(df_corr[col], errors='coerce')
+
+        corr = df_corr[available_numeric].corr()
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
+        plt.title('Korrelations-Heatmap der numerischen Features')
+        plt.tight_layout()
+        plt.savefig('features_correlation_heatmap.png', dpi=300, bbox_inches='tight')
+        plt.show()
+
+# --- Feature engineering (incl. LabelEncoder & imputer) ---
+def feature_engineering(df):
+    df_processed = df.copy()
+
+    # Clean and convert the numeric features
+    for feature in NUMERIC_FEATURES:
+        if feature in df_processed.columns:
+            if df_processed[feature].dtype == 'object':
+                df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '')
+            df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')
+
+    # Time-based features derived from the date column
+    if 'Datum' in df_processed.columns:
+        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
+        df_processed['hour'] = df_processed['Datum'].dt.hour
+        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
+        df_processed['month'] = df_processed['Datum'].dt.month
+        time_features = ['hour', 'day_of_week', 'month']
+    else:
+        time_features = []
+
+    # All features that go into the model
+    feature_names = [f for f in NUMERIC_FEATURES if f in df_processed.columns] + time_features
+
+    X = df_processed[feature_names]
+
+    # LabelEncoder for sport_label (Kraftsport/Ausdauersport)
+    label_encoder = LabelEncoder()
+    y = label_encoder.fit_transform(df_processed['sport_label'])
+
+    # Imputer for missing values
+    imputer = SimpleImputer(strategy='median')
+    X_imputed = imputer.fit_transform(X)
+
+    return X_imputed, y, feature_names, imputer, label_encoder
+
+
+# --- SVM training with hyperparameter tuning ---
+def train_svm_model(X, y):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
+
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1], 'kernel': ['rbf', 'linear']}
+
+    svm = SVC(random_state=42)
+    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
+    grid_search.fit(X_train_scaled, y_train)
+
+    best_svm = grid_search.best_estimator_
+
+    y_pred = best_svm.predict(X_test_scaled)
+
+    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)
+
+    return best_svm, X_test_scaled, y_test, y_pred, grid_search.best_params_, cv_scores, scaler
+
+# --- Evaluation ---
+def evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder):
+    print("=== SVM-Modell für Sportarten-Klassifikation ===")
+    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
+    print(f"\nBeste Hyperparameter: {best_params}")
+
+    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
+    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
+
+    test_acc = accuracy_score(y_test, y_pred)
+    print(f"\nTest-Genauigkeit: {test_acc:.4f}")
+
+    class_names = list(label_encoder.classes_)  # e.g. ['Ausdauersport', 'Kraftsport']
+
+    print("\nKlassifikationsbericht:")
+    print(classification_report(y_test, y_pred, target_names=class_names))
+
+    print("\nKonfusionsmatrix:")
+    cm = confusion_matrix(y_test, y_pred)
+    print(cm)
+
+    # Heatmap of the confusion matrix
+    plt.figure(figsize=(6, 5))
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
+    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
+    plt.ylabel('Wahre Klasse')
+    plt.xlabel('Vorhergesagte Klasse')
+    plt.tight_layout()
+    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
+    plt.show()
+
+    return test_acc
+
+# --- Feature importance (linear SVM only) ---
+def feature_importance_analysis(model, X_test, feature_names):
+    if hasattr(model, 'kernel') and model.kernel == 'linear':
+        importance = np.abs(model.coef_[0])
+        feature_importance = (pd.DataFrame({'feature': feature_names, 'importance': importance}).sort_values('importance', ascending=False))
+
+        print("\nFeature-Wichtigkeit (lineare SVM):")
+        print(feature_importance)
+
+        plt.figure(figsize=(10, 6))
+        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
+        plt.title('Top 10 Feature-Wichtigkeiten')
+        plt.xlabel('Wichtigkeit')
+        plt.tight_layout()
+        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
+        plt.show()
+    else:
+        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar (aktueller Kernel ist nicht 'linear').")
+
+
+def main():
+    print("Starte SVM-Modell für Sportarten-Klassifikation...")
+
+    try:
+        df = load_dataframe()
+        if df is None:
+            return None, None, None, None, None
+
+        df = preprocess_dataframe(df)
+        print(f"Daten geladen und vorverarbeitet: {df.shape[0]} Datensätze.")
+        print("Verteilung der Sportarten:")
+        print(df['sport_label'].value_counts())
+
+        # Exploratory plots: count plot, scatter plot, correlation heatmap
+        exploratory_plots(df)
+
+        # Feature engineering
+        X, y, feature_names, imputer, label_encoder = feature_engineering(df)
+        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features.")
+
+        # Train the SVM model
+        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)
+
+        # Evaluation
+        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder)
+
+        # Feature importances (only for a linear kernel)
+        feature_importance_analysis(model, X_test, feature_names)
+
+        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")
+        print("\nGespeicherte Dateien:")
+        print("- class_countplot.png")
+        print("- scatterplot_features.png")
+        print("- features_correlation_heatmap.png")
+        print("- confusion_matrix.png")
+        print("- feature_importance.png (falls lineare SVM)")
+
+        return model, scaler, imputer, feature_names, label_encoder
+
+    except Exception as e:
+        print(f"Fehler bei der Ausführung: {str(e)}")
+        return None, None, None, None, None
+
+if __name__ == "__main__":
+    model, scaler, imputer, feature_names, label_encoder = main()
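
Not part of the patch above: a minimal, hypothetical usage sketch of the objects returned by main(), assuming the patch has been applied, svm_modell_2.py is importable from the working directory, and Activities_rohdaten.csv is present so that training succeeds. The feature values below ('Distanz', 'Kalorien') are invented example numbers, not real data.

    # Hypothetical usage sketch: illustration only, not part of svm_modell_2.py.
    import pandas as pd
    from svm_modell_2 import main

    # Retrains the model (including the grid search) and returns the fitted artefacts.
    model, scaler, imputer, feature_names, label_encoder = main()

    if model is not None:
        # One new activity with the same feature columns the model was trained on;
        # unspecified features stay at 0.0 and serve only as placeholders.
        new_activity = pd.DataFrame([{name: 0.0 for name in feature_names}])
        new_activity.loc[0, 'Distanz'] = 10.5     # example distance
        new_activity.loc[0, 'Kalorien'] = 600.0   # example calories

        X_new = imputer.transform(new_activity[feature_names])  # same imputation as during training
        X_new = scaler.transform(X_new)                         # same scaling as during training
        prediction = model.predict(X_new)
        print(label_encoder.inverse_transform(prediction))      # e.g. ['Ausdauersport']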