From 40e97d1b9ee3639038e4a23f2f31eb610dd67fec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adu=20Alex=20G=C3=B6llnitz?=
Date: Mon, 24 Nov 2025 13:56:52 +0100
Subject: [PATCH] Support vector machine for binary classification, version 2.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The revised code introduces several structural improvements and extensions that make the machine-learning model clearer, more modular, and more complete from a data-science perspective. The main changes are:

Additional exploratory plots:
- count plot of the class distribution
- scatter plot of the first two numeric features
- correlation heatmap for analysing the relationships between features

LabelEncoder implemented correctly

Confusion matrix visualised as a heatmap
---
 svm_modell_2.py | 255 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 svm_modell_2.py

diff --git a/svm_modell_2.py b/svm_modell_2.py
new file mode 100644
index 0000000..730cbdb
--- /dev/null
+++ b/svm_modell_2.py
@@ -0,0 +1,255 @@
+# Support Vector Machine Version 2.0
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.svm import SVC
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+from sklearn.impute import SimpleImputer
+import warnings
+warnings.filterwarnings('ignore')
+
+
+
+FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®', 'Aktivitätstyp']
+
+NUMERIC_FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®']
+
+
+def load_dataframe():
+    try:
+        df = pd.read_csv("Activities_rohdaten.csv", usecols=FEATURES)
+        return df
+    except FileNotFoundError:
+        print("Datei 'Activities_rohdaten.csv' nicht gefunden.")
+        return None
+
+# --- Sport-type classification & labels ---
+def preprocess_dataframe(df):
+
+    def classify_activity(x):
+        x = str(x).lower()
+        if 'kraft' in x:
+            return 0  # strength training
+        if ('rad' in x or 'bike' in x or 'cycling' in x or 'velo' in x or 'schwimm' in x or 'laufen' in x or 'run' in x):
+            return 1  # endurance sport
+        return -1
+
+    df['sport_category'] = df['Aktivitätstyp'].apply(classify_activity)
+
+    # Keep only valid classes
+    df = df[df['sport_category'] != -1].copy()
+
+    # Human-readable labels for plots and the LabelEncoder
+    df['sport_label'] = df['sport_category'].map({
+        0: 'Kraftsport',
+        1: 'Ausdauersport'
+    })
+
+    return df
+
+# --- Exploratory plots: count plot, scatter plot, correlation heatmap ---
+def exploratory_plots(df):
+    plt.figure(figsize=(6, 4))
+    sns.countplot(x='sport_label', data=df)
+    plt.title('Verteilung der Sportarten')
+    plt.xlabel('Sportart')
+    plt.ylabel('Anzahl')
+    plt.tight_layout()
+    plt.savefig('class_countplot.png', dpi=300, bbox_inches='tight')
+    plt.show()
+
+    available_numeric = [f for f in NUMERIC_FEATURES if f in df.columns]
+    if len(available_numeric) >= 2:
+        feat_x = available_numeric[0]
+        feat_y = available_numeric[1]
+
+        # Convert to numeric for plotting
+        df_plot = df.copy()
+        for col in [feat_x, feat_y]:
+            df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce')
+
+        plt.figure(figsize=(6, 5))
+        sns.scatterplot(data=df_plot, x=feat_x, y=feat_y, hue='sport_label')
+        plt.title(f'Scatterplot: {feat_x} vs. {feat_y}')
+        plt.tight_layout()
+        plt.savefig('scatterplot_features.png', dpi=300, bbox_inches='tight')
+        plt.show()
+
+    if len(available_numeric) > 1:
+        df_corr = df.copy()
+        for col in available_numeric:
+            df_corr[col] = pd.to_numeric(df_corr[col], errors='coerce')
+
+        corr = df_corr[available_numeric].corr()
+        plt.figure(figsize=(8, 6))
+        sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
+        plt.title('Korrelations-Heatmap der numerischen Features')
+        plt.tight_layout()
+        plt.savefig('features_correlation_heatmap.png', dpi=300, bbox_inches='tight')
+        plt.show()
+
+# --- Feature engineering (incl. LabelEncoder & imputer) ---
+def feature_engineering(df):
+    df_processed = df.copy()
+
+    # Clean and convert the numeric features
+    for feature in NUMERIC_FEATURES:
+        if feature in df_processed.columns:
+            if df_processed[feature].dtype == 'object':
+                df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '')
+            df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')
+
+    # Time-based features derived from the date column
+    if 'Datum' in df_processed.columns:
+        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
+        df_processed['hour'] = df_processed['Datum'].dt.hour
+        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
+        df_processed['month'] = df_processed['Datum'].dt.month
+        time_features = ['hour', 'day_of_week', 'month']
+    else:
+        time_features = []
+
+    # All features that go into the model
+    feature_names = [f for f in NUMERIC_FEATURES if f in df_processed.columns] + time_features
+
+    X = df_processed[feature_names]
+
+    # LabelEncoder for sport_label (Kraftsport/Ausdauersport)
+    label_encoder = LabelEncoder()
+    y = label_encoder.fit_transform(df_processed['sport_label'])
+
+    # Imputer for missing values
+    imputer = SimpleImputer(strategy='median')
+    X_imputed = imputer.fit_transform(X)
+
+    return X_imputed, y, feature_names, imputer, label_encoder
+
+
+# --- SVM training with hyperparameter tuning ---
+def train_svm_model(X, y):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
+
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1], 'kernel': ['rbf', 'linear']}
+
+    svm = SVC(random_state=42)
+    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
+    grid_search.fit(X_train_scaled, y_train)
+
+    best_svm = grid_search.best_estimator_
+
+    y_pred = best_svm.predict(X_test_scaled)
+
+    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)
+
+    return best_svm, X_test_scaled, y_test, y_pred, grid_search.best_params_, cv_scores, scaler
+
+# --- Evaluation ---
+def evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder):
+    print("=== SVM-Modell für Sportarten-Klassifikation ===")
+    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
+    print(f"\nBeste Hyperparameter: {best_params}")
+
+    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
+    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
+
+    test_acc = accuracy_score(y_test, y_pred)
+    print(f"\nTest-Genauigkeit: {test_acc:.4f}")
+
+    class_names = list(label_encoder.classes_)  # e.g. ['Ausdauersport', 'Kraftsport']
+
+    print("\nKlassifikationsbericht:")
+    print(classification_report(y_test, y_pred, target_names=class_names))
+
+    print("\nKonfusionsmatrix:")
+    cm = confusion_matrix(y_test, y_pred)
+    print(cm)
+
+    # Heatmap of the confusion matrix
+    plt.figure(figsize=(6, 5))
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
+    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
+    plt.ylabel('Wahre Klasse')
+    plt.xlabel('Vorhergesagte Klasse')
+    plt.tight_layout()
+    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
+    plt.show()
+
+    return test_acc
+
+# --- Feature importance (linear SVM only) ---
+def feature_importance_analysis(model, X_test, feature_names):
+    if hasattr(model, 'kernel') and model.kernel == 'linear':
+        importance = np.abs(model.coef_[0])
+        feature_importance = (pd.DataFrame({'feature': feature_names, 'importance': importance}).sort_values('importance', ascending=False))
+
+        print("\nFeature-Wichtigkeit (lineare SVM):")
+        print(feature_importance)
+
+        plt.figure(figsize=(10, 6))
+        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
+        plt.title('Top 10 Feature-Wichtigkeiten')
+        plt.xlabel('Wichtigkeit')
+        plt.tight_layout()
+        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
+        plt.show()
+    else:
+        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar (aktueller Kernel ist nicht 'linear').")
+
+
+def main():
+    print("Starte SVM-Modell für Sportarten-Klassifikation...")
+
+    try:
+        df = load_dataframe()
+        if df is None:
+            return None, None, None, None, None
+
+        df = preprocess_dataframe(df)
+        print(f"Daten geladen und vorverarbeitet: {df.shape[0]} Datensätze.")
+        print("Verteilung der Sportarten:")
+        print(df['sport_label'].value_counts())
+
+        # Exploratory plots: count plot, scatter plot, correlation heatmap
+        exploratory_plots(df)
+
+        # Feature engineering
+        X, y, feature_names, imputer, label_encoder = feature_engineering(df)
+        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features.")
+
+        # Train the SVM model
+        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)
+
+        # Evaluation
+        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder)
+
+        # Feature importances (only for a linear kernel)
+        feature_importance_analysis(model, X_test, feature_names)
+
+        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")
+        print("\nGespeicherte Dateien:")
+        print("- class_countplot.png")
+        print("- scatterplot_features.png")
+        print("- features_correlation_heatmap.png")
+        print("- confusion_matrix.png")
+        print("- feature_importance.png (falls lineare SVM)")
+
+        return model, scaler, imputer, feature_names, label_encoder
+
+    except Exception as e:
+        print(f"Fehler bei der Ausführung: {str(e)}")
+        return None, None, None, None, None
+
+if __name__ == "__main__":
+    model, scaler, imputer, feature_names, label_encoder = main()
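
Not part of the patch above: a minimal, hypothetical usage sketch of the objects returned by main(), assuming the patch has been applied, svm_modell_2.py is importable from the working directory, and Activities_rohdaten.csv is present so that training succeeds. The feature values below ('Distanz', 'Kalorien') are invented example numbers, not real data.

    # Hypothetical usage sketch: illustration only, not part of svm_modell_2.py.
    import pandas as pd
    from svm_modell_2 import main

    # Retrains the model (including the grid search) and returns the fitted artefacts.
    model, scaler, imputer, feature_names, label_encoder = main()

    if model is not None:
        # One new activity with the same feature columns the model was trained on;
        # unspecified features stay at 0.0 and serve only as placeholders.
        new_activity = pd.DataFrame([{name: 0.0 for name in feature_names}])
        new_activity.loc[0, 'Distanz'] = 10.5     # example distance
        new_activity.loc[0, 'Kalorien'] = 600.0   # example calories

        X_new = imputer.transform(new_activity[feature_names])  # same imputation as during training
        X_new = scaler.transform(X_new)                         # same scaling as during training
        prediction = model.predict(X_new)
        print(label_encoder.inverse_transform(prediction))      # e.g. ['Ausdauersport']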