From a545950451cb7919a3504f49167faa5eb7ae4d64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adu=20Alex=20G=C3=B6llnitz?=
Date: Thu, 4 Dec 2025 08:15:27 +0100
Subject: [PATCH] =?UTF-8?q?svm=5Fmodell=5F2.py=20gel=C3=B6scht?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 svm_modell_2.py | 255 ------------------------------------------------
 1 file changed, 255 deletions(-)
 delete mode 100644 svm_modell_2.py

diff --git a/svm_modell_2.py b/svm_modell_2.py
deleted file mode 100644
index 730cbdb..0000000
--- a/svm_modell_2.py
+++ /dev/null
@@ -1,255 +0,0 @@
-# Support Vector Machine Version 2.0
-
-import pandas as pd
-import numpy as np
-import matplotlib.pyplot as plt
-import seaborn as sns
-from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.svm import SVC
-from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
-from sklearn.impute import SimpleImputer
-import warnings
-warnings.filterwarnings('ignore')
-
-
-FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®', 'Aktivitätstyp']
-
-NUMERIC_FEATURES = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®']
-
-
-def load_dataframe():
-    try:
-        df = pd.read_csv("Activities_rohdaten.csv", usecols=FEATURES)
-        return df
-    except FileNotFoundError:
-        print("File 'Activities_rohdaten.csv' not found.")
-        return None
-
-
-# --- Sport-type classification & labels ---
-def preprocess_dataframe(df):
-
-    def classify_activity(x):
-        x = str(x).lower()
-        if 'kraft' in x:
-            return 0  # Kraftsport (strength training)
-        if ('rad' in x or 'bike' in x or 'cycling' in x or 'velo' in x or 'schwimm' in x or 'laufen' in x or 'run' in x):
-            return 1  # Ausdauersport (endurance sport)
-        return -1
-
-    df['sport_category'] = df['Aktivitätstyp'].apply(classify_activity)
-
-    # Keep only valid classes
-    df = df[df['sport_category'] != -1].copy()
-
-    # Readable labels for plots and the LabelEncoder
-    df['sport_label'] = df['sport_category'].map({
-        0: 'Kraftsport',
-        1: 'Ausdauersport'
-    })
-
-    return df
-
-
-# --- Exploratory plots: countplot, scatterplot, correlation heatmap ---
-def exploratory_plots(df):
-    plt.figure(figsize=(6, 4))
-    sns.countplot(x='sport_label', data=df)
-    plt.title('Distribution of sport types')
-    plt.xlabel('Sport type')
-    plt.ylabel('Count')
-    plt.tight_layout()
-    plt.savefig('class_countplot.png', dpi=300, bbox_inches='tight')
-    plt.show()
-
-    available_numeric = [f for f in NUMERIC_FEATURES if f in df.columns]
-    if len(available_numeric) >= 2:
-        feat_x = available_numeric[0]
-        feat_y = available_numeric[1]
-
-        # Convert to numeric for plotting
-        df_plot = df.copy()
-        for col in [feat_x, feat_y]:
-            df_plot[col] = pd.to_numeric(df_plot[col], errors='coerce')
-
-        plt.figure(figsize=(6, 5))
-        sns.scatterplot(data=df_plot, x=feat_x, y=feat_y, hue='sport_label')
-        plt.title(f'Scatterplot: {feat_x} vs. {feat_y}')
-        plt.tight_layout()
-        plt.savefig('scatterplot_features.png', dpi=300, bbox_inches='tight')
-        plt.show()
-
-    if len(available_numeric) > 1:
-        df_corr = df.copy()
-        for col in available_numeric:
-            df_corr[col] = pd.to_numeric(df_corr[col], errors='coerce')
-
-        corr = df_corr[available_numeric].corr()
-        plt.figure(figsize=(8, 6))
-        sns.heatmap(corr, annot=True, fmt=".2f", cmap='coolwarm')
-        plt.title('Correlation heatmap of the numeric features')
-        plt.tight_layout()
-        plt.savefig('features_correlation_heatmap.png', dpi=300, bbox_inches='tight')
-        plt.show()
-
-
-# --- Feature engineering (incl. LabelEncoder & imputer) ---
-def feature_engineering(df):
-    df_processed = df.copy()
-
-    # Clean and convert the numeric features
-    for feature in NUMERIC_FEATURES:
-        if feature in df_processed.columns:
-            if df_processed[feature].dtype == 'object':
-                df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '')
-            df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')
-
-    # Time-based features from the date column
-    # (only if 'Datum' was loaded; it is not in FEATURES, so this branch is normally skipped)
-    if 'Datum' in df_processed.columns:
-        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
-        df_processed['hour'] = df_processed['Datum'].dt.hour
-        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
-        df_processed['month'] = df_processed['Datum'].dt.month
-        time_features = ['hour', 'day_of_week', 'month']
-    else:
-        time_features = []
-
-    # All features that go into the model
-    feature_names = [f for f in NUMERIC_FEATURES if f in df_processed.columns] + time_features
-
-    X = df_processed[feature_names]
-
-    # LabelEncoder for sport_label ('Kraftsport'/'Ausdauersport')
-    label_encoder = LabelEncoder()
-    y = label_encoder.fit_transform(df_processed['sport_label'])
-
-    # Impute missing values
-    imputer = SimpleImputer(strategy='median')
-    X_imputed = imputer.fit_transform(X)
-
-    return X_imputed, y, feature_names, imputer, label_encoder
-
-
-# --- SVM training with hyperparameter tuning ---
-def train_svm_model(X, y):
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
-
-    scaler = StandardScaler()
-    X_train_scaled = scaler.fit_transform(X_train)
-    X_test_scaled = scaler.transform(X_test)
-
-    param_grid = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1], 'kernel': ['rbf', 'linear']}
-
-    svm = SVC(random_state=42)
-    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
-    grid_search.fit(X_train_scaled, y_train)
-
-    best_svm = grid_search.best_estimator_
-
-    y_pred = best_svm.predict(X_test_scaled)
-
-    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)
-
-    return best_svm, X_test_scaled, y_test, y_pred, grid_search.best_params_, cv_scores, scaler
-
-
-# --- Evaluation ---
-def evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder):
-    print("=== SVM model for sport-type classification ===")
-    print(f"Number of test samples: {len(y_test)}")
-    print(f"\nBest hyperparameters: {best_params}")
-
-    print(f"\nCross-validation (CV scores): {cv_scores}")
-    print(f"Mean CV accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
-
-    test_acc = accuracy_score(y_test, y_pred)
-    print(f"\nTest accuracy: {test_acc:.4f}")
-
-    class_names = list(label_encoder.classes_)  # e.g. ['Ausdauersport', 'Kraftsport']
-
-    print("\nClassification report:")
-    print(classification_report(y_test, y_pred, target_names=class_names))
-
-    print("\nConfusion matrix:")
-    cm = confusion_matrix(y_test, y_pred)
-    print(cm)
-
-    # Heatmap of the confusion matrix
-    plt.figure(figsize=(6, 5))
-    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
-    plt.title('Confusion matrix - SVM sport-type classification')
-    plt.ylabel('True class')
-    plt.xlabel('Predicted class')
-    plt.tight_layout()
-    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
-    plt.show()
-
-    return test_acc
-
-
-# --- Feature importance (linear SVM only) ---
-def feature_importance_analysis(model, X_test, feature_names):
-    if hasattr(model, 'kernel') and model.kernel == 'linear':
-        importance = np.abs(model.coef_[0])
-        feature_importance = (pd.DataFrame({'feature': feature_names, 'importance': importance}).sort_values('importance', ascending=False))
-
-        print("\nFeature importance (linear SVM):")
-        print(feature_importance)
-
-        plt.figure(figsize=(10, 6))
-        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
-        plt.title('Top 10 feature importances')
-        plt.xlabel('Importance')
-        plt.tight_layout()
-        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
-        plt.show()
-    else:
-        print("\nFeature importance is only available for a linear SVM (the current kernel is not 'linear').")
-
-
-def main():
-    print("Starting SVM model for sport-type classification...")
-
-    try:
-        df = load_dataframe()
-        if df is None:
-            return None, None, None, None, None
-
-        df = preprocess_dataframe(df)
-        print(f"Data loaded and preprocessed: {df.shape[0]} records.")
-        print("Distribution of sport types:")
-        print(df['sport_label'].value_counts())
-
-        # Exploratory plots: countplot, scatterplot, correlation heatmap
-        exploratory_plots(df)
-
-        # Feature engineering
-        X, y, feature_names, imputer, label_encoder = feature_engineering(df)
-        print(f"Feature engineering finished: {X.shape[1]} features.")
-
-        # Train the SVM model
-        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)
-
-        # Evaluation
-        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params, label_encoder)
-
-        # Feature importances (only for a linear kernel)
-        feature_importance_analysis(model, X_test, feature_names)
-
-        print(f"\nModel trained successfully with accuracy: {accuracy:.4f}")
-        print("\nSaved files:")
-        print("- class_countplot.png")
-        print("- scatterplot_features.png")
-        print("- features_correlation_heatmap.png")
-        print("- confusion_matrix.png")
-        print("- feature_importance.png (only for a linear SVM)")
-
-        return model, scaler, imputer, feature_names, label_encoder
-
-    except Exception as e:
-        print(f"Error during execution: {str(e)}")
-        return None, None, None, None, None
-
-
-if __name__ == "__main__":
-    model, scaler, imputer, feature_names, label_encoder = main()