"""Machine learning model 1 / graded assignment II, Data Science, 1st semester.

Support vector machine (SVM) for binary classification of sport activities
into endurance sports (label 1) and strength sports (label 0).
"""

from datetime import datetime
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

warnings.filterwarnings('ignore')


def load_and_preprocess_data(csv_path="Activities_rohdaten.csv"):
    """Load the activities CSV and derive the binary target column.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the file name the
            script has always used, so existing callers are unaffected.

    Returns:
        A tuple ``(df, available_features)``: the rows whose activity type is
        one of the known endurance/strength sports, with an added
        ``sport_category`` column (1 = endurance, 0 = strength), and the list
        of candidate numeric feature columns that exist in the file.

    Raises:
        KeyError: If the file has no ``Aktivitätstyp`` column.
    """
    # sep=None lets the python engine sniff the delimiter.
    df = pd.read_csv(csv_path, sep=None, engine="python")

    # Strip stray whitespace around the column names.
    df.columns = df.columns.str.strip()

    # Activity types used for the binary classification
    # ('endurance sport' vs. 'strength sport').
    endurance_sports = ['Laufen', 'Rennradfahren', 'Schwimmen', 'Radfahren']
    strength_sports = ['Krafttraining']

    # Binary target: 1 = endurance, 0 = strength, -1 = neither.
    df['sport_category'] = df['Aktivitätstyp'].apply(
        lambda x: 1 if x in endurance_sports
        else (0 if x in strength_sports else -1)
    )

    # Keep only rows that belong to one of the two categories.
    df = df[df['sport_category'] != -1]

    # Candidate features considered relevant for the classification.
    numeric_features = [
        'Distanz',
        'Kalorien',
        'Ø Herzfrequenz',
        'Maximale Herzfrequenz',
        'Aerober TE',
        'Training Stress Score®',
    ]

    # Only use the candidates that are actually present in the file.
    available_features = [col for col in numeric_features if col in df.columns]

    return df, available_features


def feature_engineering(df, features):
    """Clean the numeric features, add time features and impute gaps.

    Args:
        df: Data frame containing the ``features`` columns and a
            ``sport_category`` column; optionally a ``Datum`` column.
        features: Names of the feature columns to use. The list is not
            modified.

    Returns:
        A tuple ``(X_imputed, y, feature_names, imputer)``: the imputed
        feature matrix, the target series, the names of the columns that went
        into the imputer (input features plus ``hour``, ``day_of_week`` and
        ``month`` when ``Datum`` is present) and the fitted ``SimpleImputer``.
    """
    df_processed = df.copy()
    # Work on a copy so the caller's list is not extended as a side effect
    # (repeated calls would otherwise accumulate duplicate time features).
    feature_names = list(features)

    for feature in feature_names:
        # Text columns (object or pandas string dtype) have their commas
        # removed before conversion.
        if not pd.api.types.is_numeric_dtype(df_processed[feature]):
            df_processed[feature] = (
                df_processed[feature].astype(str).str.replace(',', '', regex=False)
            )
        # Anything that still cannot be parsed becomes NaN and is imputed below.
        df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')

    # Derive time-based features from the date column when it exists.
    if 'Datum' in df_processed.columns:
        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
        df_processed['hour'] = df_processed['Datum'].dt.hour
        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
        df_processed['month'] = df_processed['Datum'].dt.month
        feature_names.extend(['hour', 'day_of_week', 'month'])

    X = df_processed[feature_names]
    y = df_processed['sport_category']

    # Fill missing values with the per-column median.
    # NOTE(review): the imputer is fitted on all rows, i.e. before the
    # train/test split in train_svm_model, so test rows influence the medians.
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    return X_imputed, y, feature_names, imputer


def train_svm_model(X, y):
    """Train an SVM with grid-searched hyperparameters.

    Args:
        X: Feature matrix.
        y: Target labels.

    Returns:
        A tuple ``(best_svm, X_test_scaled, y_test, y_pred, best_params,
        cv_scores, scaler)``: the best estimator found by the grid search, the
        scaled test features, the test labels, the predictions for the test
        set, the best parameter combination, the 5-fold cross-validation
        scores of the best estimator on the training set, and the fitted
        scaler.
    """
    # 70/30 split, stratified so both sets keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Fit the scaler on the training data only, then apply it to both sets.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Hyperparameter grid. 'gamma' has no effect for the linear kernel, so the
    # linear candidates are evaluated once per gamma value.
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear'],
    }

    svm = SVC(random_state=42)
    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)

    best_svm = grid_search.best_estimator_

    # Predictions on the held-out test set.
    y_pred = best_svm.predict(X_test_scaled)

    # Cross-validate the selected model on the training data.
    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)

    return (
        best_svm,
        X_test_scaled,
        y_test,
        y_pred,
        grid_search.best_params_,
        cv_scores,
        scaler,
    )


def evaluate_model(y_test, y_pred, cv_scores, best_params):
    """Print the evaluation metrics and plot the confusion matrix.

    The confusion matrix is saved as ``confusion_matrix.png`` and shown.

    Args:
        y_test: True labels of the test set.
        y_pred: Predicted labels for the test set.
        cv_scores: Array of cross-validation scores.
        best_params: Best hyperparameters found by the grid search.

    Returns:
        The accuracy on the test set.
    """
    class_labels = [0, 1]
    class_names = ['Kraftsport', 'Ausdauersport']
    accuracy = accuracy_score(y_test, y_pred)

    print("=== SVM-Modell für Sportarten-Klassifikation ===")
    # y_test and y_pred describe the same samples; adding their lengths
    # reported twice the real number of evaluated records.
    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
    print(f"\nBeste Hyperparameter: {best_params}")
    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
    print(f"\nTest-Genauigkeit: {accuracy:.4f}")

    # Explicit labels keep the report and the matrix 2x2 (matching the class
    # names) even if one class is absent from the test split.
    print("\nKlassifikationsbericht:")
    print(classification_report(y_test, y_pred, labels=class_labels,
                                target_names=class_names))

    print("\nKonfusionsmatrix:")
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    print(cm)

    # Heat map of the confusion matrix.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
    plt.ylabel('Wahre Klasse')
    plt.xlabel('Vorhergesagte Klasse')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return accuracy


def feature_importance_analysis(model, X_test, feature_names):
    """Report feature importances for a linear SVM.

    For a linear kernel the absolute coefficients are printed, plotted (top
    10) and saved as ``feature_importance.png``. For any other kernel only a
    notice is printed and no file is written.

    Args:
        model: Fitted ``SVC`` model.
        X_test: Test features (currently unused).
        feature_names: Feature names, in the column order used for training.
    """
    if model.kernel != 'linear':
        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar")
        return

    # The magnitude of each coefficient serves as the importance score.
    importance = np.abs(model.coef_[0])
    feature_importance = pd.DataFrame(
        {'feature': feature_names, 'importance': importance}
    ).sort_values('importance', ascending=False)

    print("\nFeature-Wichtigkeit (lineare SVM):")
    print(feature_importance)

    # Bar chart of the ten most important features.
    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
    plt.title('Top 10 Feature-Wichtigkeiten')
    plt.xlabel('Wichtigkeit')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()


def main():
    """Run the pipeline: load, engineer features, train, evaluate.

    Returns:
        A tuple ``(model, scaler, imputer, feature_names)``, or four ``None``
        values if any step raised an exception (the error is printed).
    """
    print("Starte SVM-Modell für Sportarten-Klassifikation...")

    try:
        # 1. Load and preprocess the data.
        df, features = load_and_preprocess_data()
        print(f"Daten geladen: {df.shape[0]} Datensätze, {len(features)} Features")
        print(f"Verteilung der Sportarten: {df['sport_category'].value_counts().to_dict()}")

        # 2. Feature engineering.
        X, y, feature_names, imputer = feature_engineering(df, features)
        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features")

        # 3. Train the SVM model.
        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)

        # 4. Evaluate the model.
        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params)

        # 5. Feature analysis.
        feature_importance_analysis(model, X_test, feature_names)

        # The importance plot is only written for a linear kernel, so list
        # only the files that were actually saved.
        saved_files = ['confusion_matrix.png']
        if model.kernel == 'linear':
            saved_files.append('feature_importance.png')

        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")
        print(f"\nGespeicherte Dateien: {', '.join(saved_files)}")

        return model, scaler, imputer, feature_names

    except Exception as e:
        # Top-level boundary of the script: report the failure and signal it
        # to the caller through the all-None result.
        print(f"Fehler bei der Ausführung: {str(e)}")
        return None, None, None, None


if __name__ == "__main__":
    model, scaler, imputer, feature_names = main()