diff --git a/svm_modell.py b/svm_modell.py
index 6815d9d..53ab0f7 100644
--- a/svm_modell.py
+++ b/svm_modell.py
@@ -1,204 +1,201 @@
-# Machine learning model 1 / Leistungsnachweis II Data Science / semester 1
-# Support vector machine for binary classification
-
-import pandas as pd
-import numpy as np
-from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
-from sklearn.preprocessing import StandardScaler, LabelEncoder
-from sklearn.svm import SVC
-from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
-from sklearn.impute import SimpleImputer
-import matplotlib.pyplot as plt
-import seaborn as sns
-from datetime import datetime
-import warnings
-warnings.filterwarnings('ignore')
-
-def load_and_preprocess_data():
-
-    # Loads and prepares the data from the various CSV files
-
-    # Load the CSV files
-    activities = pd.read_csv("Activities_rohdaten.csv", sep=None, engine="python")
-
-    # Clean and consolidate the data
-    df = activities.copy()
-
-    # Clean up the column names
-    df.columns = df.columns.str.strip()
-
-    # Define the sports for binary classification
-
-    # We classify between 'Ausdauersport' (endurance) and 'Kraftsport' (strength)
-    endurance_sports = ['Laufen', 'Rennradfahren', 'Schwimmen', 'Radfahren']
-    strength_sports = ['Krafttraining']
-
-    # Create the binary target variable
-    df['sport_category'] = df['Aktivitätstyp'].apply(lambda x: 1
-                                                     if x in endurance_sports
-                                                     else (0 if x in strength_sports else -1))
-
-    # Keep only valid categories
-    df = df[df['sport_category'] != -1]
-
-    # Select the features that are relevant for classification
-    numeric_features = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®']
-
-    # Keep only the numeric features that are actually available
-    available_features = [col for col in numeric_features if col in df.columns]
-
-    return df, available_features
-
-def feature_engineering(df, features): # Performs the feature engineering
-
-    # Work on a copy
-    df_processed = df.copy()
-
-    # Clean the numeric feature columns
-    for feature in features:
-        if df_processed[feature].dtype == 'object':
-            df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '')
-
-        # Convert to numeric values
-        df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')
-
-    # Extract time-based features from the date
-    if 'Datum' in df_processed.columns:
-        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
-        df_processed['hour'] = df_processed['Datum'].dt.hour
-        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
-        df_processed['month'] = df_processed['Datum'].dt.month
-
-        # Add the time-based features
-        time_features = ['hour', 'day_of_week', 'month']
-        features.extend(time_features)
-
-    # Handle missing values
-    X = df_processed[features]
-    y = df_processed['sport_category']
-
-    # SimpleImputer for missing values
-    imputer = SimpleImputer(strategy='median')
-    X_imputed = imputer.fit_transform(X)
-
-    return X_imputed, y, features, imputer
-
-def train_svm_model(X, y): # Trains the SVM model with hyperparameter tuning
-
-    # Split the data into training and test sets
-    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
-
-    # Scale the features
-    scaler = StandardScaler()
-    X_train_scaled = scaler.fit_transform(X_train)
-    X_test_scaled = scaler.transform(X_test)
-
-    # SVM model with hyperparameter tuning
-    param_grid = {
-        'C': [0.1, 1, 10, 100],
-        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
-        'kernel': ['rbf', 'linear']
-    }
-
-    svm = SVC(random_state=42)
-    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
-    grid_search.fit(X_train_scaled, y_train)
-
-    # Best model
-    best_svm = grid_search.best_estimator_
-
-    # Predictions
-    y_pred = best_svm.predict(X_test_scaled)
-
-    # Cross-validation
-    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)
-
-    return best_svm, X_test_scaled, y_test, y_pred, grid_search.best_params_, cv_scores, scaler
-
-def evaluate_model(y_test, y_pred, cv_scores, best_params): # Evaluates the model and prints the results
-
-    print("=== SVM-Modell für Sportarten-Klassifikation ===")
-    print(f"Anzahl der Datensätze: {len(y_test) + len(y_pred)}")
-    print(f"\nBeste Hyperparameter: {best_params}")
-
-    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
-    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
-
-    print(f"\nTest-Genauigkeit: {accuracy_score(y_test, y_pred):.4f}")
-
-    print("\nKlassifikationsbericht:")
-    print(classification_report(y_test, y_pred, target_names=['Kraftsport', 'Ausdauersport']))
-
-    print("\nKonfusionsmatrix:")
-    cm = confusion_matrix(y_test, y_pred)
-    print(cm)
-
-    # Visualize the confusion matrix
-    plt.figure(figsize=(8, 6))
-    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
-                xticklabels=['Kraftsport', 'Ausdauersport'],
-                yticklabels=['Kraftsport', 'Ausdauersport'])
-    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
-    plt.ylabel('Wahre Klasse')
-    plt.xlabel('Vorhergesagte Klasse')
-    plt.tight_layout()
-    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
-    plt.show()
-
-    return accuracy_score(y_test, y_pred)
-
-def feature_importance_analysis(model, X_test, feature_names): # Analyses the feature importances (for a linear SVM)
-
-    if model.kernel == 'linear':
-        # Feature importance for a linear SVM
-        importance = np.abs(model.coef_[0])
-        feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importance}).sort_values('importance', ascending=False)
-
-        print("\nFeature-Wichtigkeit (lineare SVM):")
-        print(feature_importance)
-
-        # Visualization
-        plt.figure(figsize=(10, 6))
-        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
-        plt.title('Top 10 Feature-Wichtigkeiten')
-        plt.xlabel('Wichtigkeit')
-        plt.tight_layout()
-        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
-        plt.show()
-    else:
-        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar")
-
-def main():
-
-    print("Starte SVM-Modell für Sportarten-Klassifikation...")
-
-    try:
-        # 1. Load and preprocess the data
-        df, features = load_and_preprocess_data()
-        print(f"Daten geladen: {df.shape[0]} Datensätze, {len(features)} Features")
-        print(f"Verteilung der Sportarten: {df['sport_category'].value_counts().to_dict()}")
-
-        # 2. Feature engineering
-        X, y, feature_names, imputer = feature_engineering(df, features)
-        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features")
-
-        # 3. Train the SVM model
-        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)
-
-        # 4. Evaluate the model
-        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params)
-
-        # 5. Feature analysis
-        feature_importance_analysis(model, X_test, feature_names)
-
-        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")
-        print("\nGespicherte Dateien: confusion_matrix.png, feature_importance.png")
-
-        return model, scaler, imputer, feature_names
-
-    except Exception as e:
-        print(f"Fehler bei der Ausführung: {str(e)}")
-        return None, None, None, None
-
-if __name__ == "__main__":
+# Machine learning model 1 / Leistungsnachweis II Data Science / semester 1
+# Support vector machine for binary classification
+
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
+from sklearn.preprocessing import StandardScaler, LabelEncoder
+from sklearn.svm import SVC
+from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
+from sklearn.impute import SimpleImputer
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datetime import datetime
+import warnings
+warnings.filterwarnings('ignore')
+
+def load_and_preprocess_data():
+
+    # Load the CSV files
+    activities = pd.read_csv("Activities_rohdaten.csv", sep=None, engine="python")
+
+    # Clean and consolidate the data
+    df = activities.copy()
+
+    # Clean up the column names
+    df.columns = df.columns.str.strip()
+
+    # Define the sports for binary classification
+    # We classify between 'Ausdauersport' (endurance) and 'Kraftsport' (strength)
+    endurance_sports = ['Laufen', 'Rennradfahren', 'Schwimmen', 'Radfahren']
+    strength_sports = ['Krafttraining']
+
+    # Create the binary target variable
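+    # (1 = endurance sport, 0 = strength training, anything else becomes -1 and is dropped just below)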
+    df['sport_category'] = df['Aktivitätstyp'].apply(lambda x: 1
+                                                     if x in endurance_sports
+                                                     else (0 if x in strength_sports else -1))
+
+    # Keep only valid categories
+    df = df[df['sport_category'] != -1]
+
+    # Select features:
+    numeric_features = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®']
+
+    # Keep only the numeric features that are actually available
+    available_features = [col for col in numeric_features if col in df.columns]
+
+    return df, available_features
+
+def feature_engineering(df, features): # Performs the feature engineering
+
+    # Work on a copy
+    df_processed = df.copy()
+
+    # Clean the numeric feature columns
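+    # Assumption: ',' in the raw export is a thousands separator (e.g. "1,234"), so it is
+    # stripped before parsing; values that still fail to parse become NaN and are filled
+    # by the median imputer further down.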
+    for feature in features:
+        if df_processed[feature].dtype == 'object':
+            df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '')
+
+        # Convert to numeric values
+        df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')
+
+    # Extract time-based features from the date
+    if 'Datum' in df_processed.columns:
+        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
+        df_processed['hour'] = df_processed['Datum'].dt.hour
+        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
+        df_processed['month'] = df_processed['Datum'].dt.month
+
+        # Add the time-based features
+        time_features = ['hour', 'day_of_week', 'month']
+        features.extend(time_features)
+
+    # Handle missing values
+    X = df_processed[features]
+    y = df_processed['sport_category']
+
+    # SimpleImputer for missing values
+    imputer = SimpleImputer(strategy='median')
+    X_imputed = imputer.fit_transform(X)
+
+    return X_imputed, y, features, imputer
+
+def train_svm_model(X, y): # Trains the SVM model with hyperparameter tuning
+
+    # Split the data into training and test sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
+
+    # Scale the features
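+    # SVMs are sensitive to feature scale, so standardize to zero mean / unit variance;
+    # the scaler is fitted on the training split only to avoid leaking test data.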
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    # SVM model with hyperparameter tuning
+    param_grid = {
+        'C': [0.1, 1, 10, 100],
+        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
+        'kernel': ['rbf', 'linear']
+    }
+
+    svm = SVC(random_state=42)
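+    # Exhaustive 5-fold grid search over 4 x 6 x 2 = 48 parameter combinations;
+    # gamma has no effect for the linear kernel, so some combinations are redundant.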
+    grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
+    grid_search.fit(X_train_scaled, y_train)
+
+    # Best model
+    best_svm = grid_search.best_estimator_
+
+    # Predictions
+    y_pred = best_svm.predict(X_test_scaled)
+
+    # Cross-validation
+    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=5)
+
+    return best_svm, X_test_scaled, y_test, y_pred, grid_search.best_params_, cv_scores, scaler
+
+def evaluate_model(y_test, y_pred, cv_scores, best_params): # Evaluates the model and prints the results
+
+    print("=== SVM-Modell für Sportarten-Klassifikation ===")
+    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
+    print(f"\nBeste Hyperparameter: {best_params}")
+
+    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
+    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
+
+    print(f"\nTest-Genauigkeit: {accuracy_score(y_test, y_pred):.4f}")
+
+    print("\nKlassifikationsbericht:")
+    print(classification_report(y_test, y_pred, target_names=['Kraftsport', 'Ausdauersport']))
+
+    print("\nKonfusionsmatrix:")
+    cm = confusion_matrix(y_test, y_pred)
+    print(cm)
+
+    # Visualize the confusion matrix
+    plt.figure(figsize=(8, 6))
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
+                xticklabels=['Kraftsport', 'Ausdauersport'],
+                yticklabels=['Kraftsport', 'Ausdauersport'])
+    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
+    plt.ylabel('Wahre Klasse')
+    plt.xlabel('Vorhergesagte Klasse')
+    plt.tight_layout()
+    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
+    plt.show()
+
+    return accuracy_score(y_test, y_pred)
+
+def feature_importance_analysis(model, X_test, feature_names): # Analyses the feature importances (for a linear SVM)
+
+    if model.kernel == 'linear':
+        # Feature importance for a linear SVM
+        importance = np.abs(model.coef_[0])
+        feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importance}).sort_values('importance', ascending=False)
+
+        print("\nFeature-Wichtigkeit (lineare SVM):")
+        print(feature_importance)
+
+        # Visualization
+        plt.figure(figsize=(10, 6))
+        sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
+        plt.title('Top 10 Feature-Wichtigkeiten')
+        plt.xlabel('Wichtigkeit')
+        plt.tight_layout()
+        plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
+        plt.show()
+    else:
+        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar")
+
+def main():
+
+    print("Starte SVM-Modell für Sportarten-Klassifikation...")
+
+    try:
+        # Load and preprocess the data
+        df, features = load_and_preprocess_data()
+        print(f"Daten geladen: {df.shape[0]} Datensätze, {len(features)} Features")
+        print(f"Verteilung der Sportarten: {df['sport_category'].value_counts().to_dict()}")
+
+        # Feature engineering
+        X, y, feature_names, imputer = feature_engineering(df, features)
+        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features")
+
+        # Train the SVM model
+        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)
+
+        # Evaluate the model
+        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params)
+
+        # Feature analysis
+        feature_importance_analysis(model, X_test, feature_names)
+
+        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")
+        print("\nGespeicherte Dateien: confusion_matrix.png, feature_importance.png")
+
+        return model, scaler, imputer, feature_names
+
+    except Exception as e:
+        print(f"Fehler bei der Ausführung: {str(e)}")
+        return None, None, None, None
+
+if __name__ == "__main__":
     model, scaler, imputer, feature_names = main()
\ No newline at end of file