From 00caa51d2142cce43a190acf34f2f17a8f8c3f37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adu=20Alex=20G=C3=B6llnitz?= Date: Thu, 4 Dec 2025 08:15:33 +0100 Subject: [PATCH] =?UTF-8?q?svm=5Fmodell.py=20gel=C3=B6scht?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- svm_modell.py | 196 -------------------------------------------------- 1 file changed, 196 deletions(-) delete mode 100644 svm_modell.py diff --git a/svm_modell.py b/svm_modell.py deleted file mode 100644 index 3e40cdb..0000000 --- a/svm_modell.py +++ /dev/null @@ -1,196 +0,0 @@ -# Machine Learning Modell 1 / Leistungsnachweis II Data Science / 1. Semester -# Support Vector Machine zu binären Klassifikation - -import pandas as pd -import numpy as np -from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV -from sklearn.preprocessing import StandardScaler, LabelEncoder -from sklearn.svm import SVC -from sklearn.metrics import classification_report, confusion_matrix, accuracy_score -from sklearn.impute import SimpleImputer -import matplotlib.pyplot as plt -import seaborn as sns -from datetime import datetime -import warnings -warnings.filterwarnings('ignore') - -def load_and_preprocess_data(): - activities = pd.read_csv("Activities_rohdaten.csv", sep=None, engine="python") - df = activities.copy() - df.columns = df.columns.str.strip() - -# Kategorisieren der Sportklassen - def classify_activity(x): - x = str(x).lower() - if 'cardio' in x or 'gehen' in x or 'multisport' in x: - return -1 - if 'kraft' in x: - return 0 - if ('rad' in x or 'bike' in x or 'cycling' in x or 'velo' in x or 'schwimm' in x or 'laufen' in x or 'run' in x): - return 1 - return -1 - - df['sport_category'] = df['Aktivitätstyp'].apply(classify_activity) - - df = df[df['sport_category'] != -1] - -# Numerische Daten welche miteinbezogen werden - numeric_features = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®'] - - available_features = [col for col in numeric_features if col in df.columns] - - return df, available_features - - -def feature_engineering(df, features): - - # Kopie erstellen - df_processed = df.copy() - - # Numerische Features bereinigen - for feature in features: - if df_processed[feature].dtype == 'object': - df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '') - - # In numerische Werte umwandeln - df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce') - - # Zeitbasierte Features aus Datum extrahieren - if 'Datum' in df_processed.columns: - df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce') - df_processed['hour'] = df_processed['Datum'].dt.hour - df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek - df_processed['month'] = df_processed['Datum'].dt.month - - # Zeitbasierte Features hinzufügen - time_features = ['hour', 'day_of_week', 'month'] - features.extend(time_features) - - # Fehlende Werte behandeln - X = df_processed[features] - y = df_processed['sport_category'] - - # SimpleImputer für fehlende Werte - imputer = SimpleImputer(strategy='median') - X_imputed = imputer.fit_transform(X) - - return X_imputed, y, features, imputer - -def train_svm_model(X, y): # Trainiert das SVM-Modell mit Hyperparameter-Tuning - - # Daten in Trainings- und Testsets aufteilen - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y) - - # Features skalieren - scaler = StandardScaler() - X_train_scaled = scaler.fit_transform(X_train) - X_test_scaled = scaler.transform(X_test) - - # SVM-Modell mit Hyperparameter-Tuning - param_grid = { - 'C': [0.1, 1, 10, 100], - 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1], - 'kernel': ['rbf', 'linear'] - } - - svm = SVC(random_state=42) - grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1) - grid_search.fit(X_train_scaled, y_train) - - # Bestes Modell - best_svm = grid_search.best_estimator_ - - # Vorhersagen - y_pred = best_svm.predict(X_test_scaled) - - # Kreuzvalidierung - cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5) - - return best_svm, X_test_scaled, y_test, y_pred, grid_search.best_params_, cv_scores, scaler - -def evaluate_model(y_test, y_pred, cv_scores, best_params): # Evaluiert das Modell und gibt Ergebnisse aus - - print("=== SVM-Modell für Sportarten-Klassifikation ===") - print(f"Anzahl der Datensätze: {len(y_test) + len(y_pred)}") - print(f"\nBeste Hyperparameter: {best_params}") - - print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}") - print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})") - - print(f"\nTest-Genauigkeit: {accuracy_score(y_test, y_pred):.4f}") - - print("\nKlassifikationsbericht:") - print(classification_report(y_test, y_pred, target_names=['Kraftsport', 'Ausdauersport'])) - - print("\nKonfusionsmatrix:") - cm = confusion_matrix(y_test, y_pred) - print(cm) - - # Visualisierung der Konfusionsmatrix - plt.figure(figsize=(8, 6)) - sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', - xticklabels=['Kraftsport', 'Ausdauersport'], - yticklabels=['Kraftsport', 'Ausdauersport']) - plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation') - plt.ylabel('Wahre Klasse') - plt.xlabel('Vorhergesagte Klasse') - plt.tight_layout() - plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight') - plt.show() - - return accuracy_score(y_test, y_pred) - -def feature_importance_analysis(model, X_test, feature_names): # Analysiert die Wichtigkeit der Features (für lineare SVM) - - if model.kernel == 'linear': - importance = np.abs(model.coef_[0]) # Feature-Wichtigkeit für lineare SVM - feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importance}).sort_values('importance', ascending=False) - - print("\nFeature-Wichtigkeit (lineare SVM):") - print(feature_importance) - - plt.figure(figsize=(10, 6)) - sns.barplot(data=feature_importance.head(10), x='importance', y='feature') - plt.title('Top 10 Feature-Wichtigkeiten') - plt.xlabel('Wichtigkeit') - plt.tight_layout() - plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight') - plt.show() - else: - print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar") - - -def main(): - - print("Starte SVM-Modell für Sportarten-Klassifikation...") - - try: - # Daten laden und vorverarbeiten - df, features = load_and_preprocess_data() - print(f"Daten geladen: {df.shape[0]} Datensätze, {len(features)} Features") - print(f"Verteilung der Sportarten: {df['sport_category'].value_counts().to_dict()}") - - # Feature Engineering - X, y, feature_names, imputer = feature_engineering(df, features) - print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features") - - # SVM-Modell trainieren - model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y) - - # Modell evaluieren - accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params) - - # Feature-Analyse - feature_importance_analysis(model, X_test, feature_names) - - print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}") - print("\nGespicherte Dateien: confusion_matrix.png, feature_importance.png") - - return model, scaler, imputer, feature_names - - except Exception as e: - print(f"Fehler bei der Ausführung: {str(e)}") - return None, None, None, None # 4x None, weill 4 Rückgabewerte vorhanden sind (model, scaler, inputer & feature_names) - -if __name__ == "__main__": - model, scaler, imputer, feature_names = main() \ No newline at end of file