commit 3d2b6f208ed02e029e0c6ecbc19509d2cf6c725f Author: Adu Alex Göllnitz Date: Wed Nov 19 15:34:13 2025 +0100 Dateien nach "/" hochladen diff --git a/Activities_rohdaten.csv b/Activities_rohdaten.csv new file mode 100644 index 0000000..778d891 --- /dev/null +++ b/Activities_rohdaten.csv @@ -0,0 +1,21 @@ +Aktivitätstyp,Datum,Favorit,Titel,Distanz,Kalorien,Zeit,Ø Herzfrequenz,Maximale Herzfrequenz,Aerober TE,Ø Trittfrequenz,Max. Trittfrequenz,Ø Geschwindigkeit,Maximale Geschwindigkeit,Anstieg gesamt,Abstieg gesamt,Ø Schrittlänge,Durchschnittliches vertikales Verhältnis,Ø vertikale Bewegung,Ø Bodenkontaktzeit,Durchschnittliche Balance der Bodenkontaktzeit,Durchschnittliche SAP,Ø Trittfrequenz,Max. Trittfrequenz,Normalized Power® (NP®),Training Stress Score®,Ø Leistung,Max. Leistung,Schläge insgesamt,Ø Swolf,Ø Schlagrate,Schritte,Wiederholungen insgesamt,Sätze insgesamt,Minimale Temperatur,Dekompression,Beste Rundenzeit,Anzahl der Runden,Maximale Temperatur,Ø Atemfrequenz,Minimale Atemfrequenz,Maximale Atemfrequenz,Zeit in Bewegung,Verstrichene Zeit,Minimale Höhe,Maximale Höhe +Krafttraining,2025-09-23 18:03:52,false,"Krafttrai​ning","0.00","136","00:24:35","102","134","0.5","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","8","4","1","26.0","Nein","00:24:35","1","29.0","--","--","--","00:24:32","00:24:35","--","--" +Krafttraining,2025-09-22 19:03:03,false,"Krafttrai​ning","0.00","224","00:42:47","103","133","2.0","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","226","113","12","22.0","Nein","00:42:47","1","26.0","--","--","--","00:04:27.2","00:42:47","--","--" +Rennradfahren,2025-09-21 12:10:15,false,"Walenstadt 
Rennradfahren","87.59","1,724","02:48:41","158","179","4.5","--","--","31.2","61.2","441","439","--","--","--","--","--","--","86","111","--","0.0","--","--","14247","--","--","--","--","--","25.0","Nein","00:05:29.0","18","35.0","35","22","45","02:48:31","02:57:35","426","596" +Krafttraining,2025-09-19 19:08:18,false,"Krafttrai​ning","0.00","403","01:12:08","110","155","0.6","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","652","326","26","25.0","Nein","01:12:08","1","28.0","--","--","--","00:13:10","01:12:08","--","--" +Rennradfahren,2025-09-18 18:30:39,false,"Walenstadt Rennradfahren","30.01","671","00:56:08","152","185","3.2","--","--","32.1","52.2","128","120","--","--","--","--","--","--","86","120","--","0.0","--","--","4790","--","--","--","--","--","17.0","Nein","00:00:01.7","7","24.0","33","23","41","00:56:05","00:56:08","426","497" +Rennradfahren,2025-09-17 15:30:28,false,"Walenstadt Rennradfahren","62.83","1,379","02:06:52","157","196","4.1","--","--","29.7","79.2","518","514","--","--","--","--","--","--","85","120","--","0.0","--","--","10431","--","--","--","--","--","18.0","Nein","00:05:20.0","13","23.0","34","24","45","02:06:47","02:10:28","426","715" +Cardio,2025-09-16 13:59:18,false,"Cardio","0.00","471","02:12:50","83","146","1.1","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","1,032","--","--","--","Nein","02:12:50","1","--","--","--","--","00:00:00","02:12:50","--","--" +Laufen,2025-09-15 18:47:23,false,"Walenstadt Laufen","6.06","508","00:43:15","147","188","2.8","149","186","7:08","3:03","15","12","0.94","10.4","9.8","315","49.5 % Links / 50.5 % Rechts","7:11","--","--","337","0.0","330","695","--","--","--","6,390","--","--","--","Nein","00:00:12.5","7","--","32","23","41","00:42:52","00:43:15","417","426" +Schwimmbad,2025-09-15 
13:33:41,false,"Schwimmbad","2,325","494","00:51:44","135","151","2.4","--","--","2:14","1:46","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","1241","47","24","--","--","--","--","Nein","00:51:44","1","--","--","--","--","00:51:44","00:52:40","--","--" +Rennradfahren,2025-09-14 12:24:30,false,"Walenstadt Rennradfahren","30.11","877","01:08:03","164","189","4.4","--","--","26.5","76.4","399","400","--","--","--","--","--","--","85","126","--","0.0","--","--","5456","--","--","--","--","--","18.0","Nein","00:00:12.4","7","25.0","35","24","47","01:07:58","01:11:44","426","729" +Rennradfahren,2025-09-12 17:02:31,false,"Walenstadt Rennradfahren","30.05","781","00:54:38","168","199","4.1","--","--","33.0","50.5","130","127","--","--","--","--","--","--","89","109","--","0.0","--","--","4737","--","--","--","--","--","16.0","Nein","00:00:04.0","7","19.0","37","26","46","00:54:32","00:59:57","427","499" +Krafttraining,2025-09-11 18:52:04,false,"Krafttrai​ning","0.00","220","00:39:14","105","166","0.8","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","220","110","12","24.0","Nein","00:39:14","1","26.0","--","--","--","00:06:50.4","00:39:14","--","--" +Rennradfahren,2025-09-10 19:00:07,false,"Walenstadt Rennradfahren","29.85","737","00:55:28","159","192","3.9","--","--","32.3","56.9","128","125","--","--","--","--","--","--","87","115","--","0.0","--","--","4628","--","--","--","--","--","16.0","Nein","00:07:12.5","6","22.0","34","19","43","00:55:25","00:55:28","427","498" +Radfahren,2025-09-10 17:47:49,false,"Radfahren","0.41","6","00:02:00","--","--","--","--","--","12.4","48.1","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","197","--","--","--","--","--","--","Nein","00:02:00","1","--","--","--","--","00:01:45","00:02:30","--","--" +Radfahren,2025-09-10 
17:12:48,false,"Radfahren","0.21","6","00:02:05","--","--","--","--","--","6.2","41.9","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","102","--","--","--","--","--","--","Nein","00:02:05","1","--","--","--","--","00:00:55","00:09:00","--","--" +Krafttraining,2025-09-09 19:04:42,false,"Krafttrai​ning","0.00","278","00:53:18","100","139","0.6","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","420","210","17","24.0","Nein","00:53:18","1","27.0","--","--","--","00:07:01.5","00:53:18","--","--" +Krafttraining,2025-09-07 11:32:58,false,"Krafttrai​ning","0.00","272","00:34:46","119","151","1.3","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","202","101","11","26.0","Nein","00:34:46","1","27.0","--","--","--","00:04:58.5","00:34:46","--","--" +Radfahren,2025-09-06 11:23:07,false,"Radfahren","4.10","151","00:10:45","--","--","--","--","--","22.9","40.4","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","636","--","--","--","--","--","--","Nein","00:10:45","1","--","--","--","--","00:10:35","00:11:00","--","--" +Rennradfahren,2025-09-06 10:58:49,false,"Walenstadt Rennradfahren","126.36","2,401","04:28:11","145","175","3.9","--","--","28.3","54.7","597","573","--","--","--","--","--","--","80","163","--","0.0","--","--","19676","--","--","--","--","--","15.0","Nein","00:03:58.3","26","28.0","33","20","47","04:27:27","05:13:45","409","466" +Krafttraining,2025-09-05 13:32:08,false,"Krafttrai​nin","0.00","203","00:48:40","91","147","0.2","--","--","--","--","--","--","--","--","--","--","--","--","--","--","--","0.0","--","--","--","--","--","880","440","31","26.0","Nein","00:48:40","1","30.0","--","--","--","00:17:29","00:49:11","--","--" diff --git a/svm_modell.py b/svm_modell.py new file mode 100644 index 0000000..6815d9d --- /dev/null +++ b/svm_modell.py @@ -0,0 +1,204 @@ +# Machine Learning Modell 1 / 
# Leistungsnachweis II (graded assignment), Data Science, 1st semester
# Support vector machine for binary classification of sport activities

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Suppresses all warnings process-wide.
warnings.filterwarnings('ignore')


def load_and_preprocess_data():
    """Load the activity export and label each row for binary classification.

    Reads ``Activities_rohdaten.csv`` from the current working directory,
    strips whitespace from the column names and adds a ``sport_category``
    column (1 = endurance sport, 0 = strength sport). Rows whose
    ``Aktivitätstyp`` belongs to neither group are dropped.

    Returns:
        tuple: ``(df, available_features)`` -- the labelled DataFrame and the
        candidate numeric feature columns that actually exist in it.
    """
    # sep=None makes the python engine detect the delimiter itself.
    activities = pd.read_csv("Activities_rohdaten.csv", sep=None, engine="python")

    df = activities.copy()
    df.columns = df.columns.str.strip()

    # 'Schwimmbad' is the label the export uses for pool swims; 'Schwimmen'
    # is kept so files that use that label are still recognised.
    # NOTE(review): the pool-swim row reports Distanz as "2,325" while rides
    # and runs are two-decimal values -- presumably metres vs. km; confirm
    # before relying on Distanz across sports.
    endurance_sports = ['Laufen', 'Rennradfahren', 'Schwimmen', 'Schwimmbad', 'Radfahren']
    strength_sports = ['Krafttraining']

    category_by_type = {name: 1 for name in endurance_sports}
    category_by_type.update({name: 0 for name in strength_sports})

    # Unknown activity types map to NaN and are marked -1 so they can be dropped.
    df['sport_category'] = (
        df['Aktivitätstyp'].map(category_by_type).fillna(-1).astype(int)
    )
    # .copy() detaches the result from the unfiltered frame.
    df = df[df['sport_category'] != -1].copy()

    numeric_features = [
        'Distanz',
        'Kalorien',
        'Ø Herzfrequenz',
        'Maximale Herzfrequenz',
        'Aerober TE',
        'Training Stress Score®',
    ]
    available_features = [col for col in numeric_features if col in df.columns]

    return df, available_features


def feature_engineering(df, features):
    """Turn the raw columns into a numeric, imputed feature matrix.

    The listed feature columns are converted to numbers (thousands separators
    removed, unparseable values such as ``--`` become NaN). If a ``Datum``
    column exists, ``hour``, ``day_of_week`` and ``month`` are derived from it
    and added as features. Missing values are filled with the column median.

    Args:
        df: DataFrame containing the feature columns and ``sport_category``.
        features: Names of the numeric feature columns. The list is not
            modified.

    Returns:
        tuple: ``(X_imputed, y, feature_names, imputer)`` -- the imputed
        feature array, the target series, the names of the columns of
        ``X_imputed`` in order, and the fitted ``SimpleImputer``.

    Raises:
        ValueError: If no feature column contains any usable value.
    """
    df_processed = df.copy()
    # Work on a copy so the caller's list is left untouched.
    feature_names = list(features)

    for feature in feature_names:
        column = df_processed[feature]
        # Covers both object and dedicated string dtypes.
        if not pd.api.types.is_numeric_dtype(column):
            column = column.astype(str).str.replace(',', '', regex=False)
        df_processed[feature] = pd.to_numeric(column, errors='coerce')

    # Time features only exist when there is a date column to derive them from.
    if 'Datum' in df_processed.columns:
        timestamps = pd.to_datetime(df_processed['Datum'], errors='coerce')
        df_processed['Datum'] = timestamps
        df_processed['hour'] = timestamps.dt.hour
        df_processed['day_of_week'] = timestamps.dt.dayofweek
        df_processed['month'] = timestamps.dt.month

        time_features = ['hour', 'day_of_week', 'month']
        feature_names.extend(
            name for name in time_features if name not in feature_names
        )

    # SimpleImputer discards columns without any observed value; drop them
    # here as well so feature_names stays aligned with the matrix columns.
    feature_names = [
        name for name in feature_names if df_processed[name].notna().any()
    ]
    if not feature_names:
        raise ValueError("Keine verwendbaren Features vorhanden")

    X = df_processed[feature_names]
    y = df_processed['sport_category']

    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    return X_imputed, y, feature_names, imputer


def train_svm_model(X, y):
    """Train an SVM classifier with a grid search over its hyperparameters.

    The data is split 70/30 (stratified, ``random_state=42``), standardised
    with a scaler fitted on the training part, and an ``SVC`` is tuned with
    ``GridSearchCV`` on accuracy.

    Args:
        X: Feature matrix.
        y: Target labels.

    Returns:
        tuple: ``(best_svm, X_test_scaled, y_test, y_pred, best_params,
        cv_scores, scaler)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    param_grid = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear'],
    }

    # Use 5 folds when possible, but never more folds than the smallest
    # class has training samples, so every fold can contain both classes.
    _, class_counts = np.unique(y_train, return_counts=True)
    n_folds = max(2, min(5, int(class_counts.min())))

    svm = SVC(random_state=42)
    grid_search = GridSearchCV(
        svm, param_grid, cv=n_folds, scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train_scaled, y_train)

    best_svm = grid_search.best_estimator_
    y_pred = best_svm.predict(X_test_scaled)

    cv_scores = cross_val_score(best_svm, X_train_scaled, y_train, cv=n_folds)

    return (
        best_svm,
        X_test_scaled,
        y_test,
        y_pred,
        grid_search.best_params_,
        cv_scores,
        scaler,
    )


def evaluate_model(y_test, y_pred, cv_scores, best_params):
    """Print the evaluation metrics and plot the confusion matrix.

    The confusion matrix heatmap is written to ``confusion_matrix.png`` and
    then shown.

    Args:
        y_test: True labels of the test set.
        y_pred: Predicted labels for the test set.
        cv_scores: Array of cross-validation scores.
        best_params: Hyperparameters selected by the grid search.

    Returns:
        float: Accuracy on the test set.
    """
    # Fixed label order keeps the report rows and the heatmap ticks aligned
    # with the class names even if one class is missing from the test split.
    class_labels = [0, 1]
    class_names = ['Kraftsport', 'Ausdauersport']

    accuracy = accuracy_score(y_test, y_pred)

    print("=== SVM-Modell für Sportarten-Klassifikation ===")
    # y_pred has one entry per test row, so the test size is len(y_test).
    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
    print(f"\nBeste Hyperparameter: {best_params}")

    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    print(f"\nTest-Genauigkeit: {accuracy:.4f}")

    print("\nKlassifikationsbericht:")
    print(classification_report(
        y_test, y_pred, labels=class_labels, target_names=class_names
    ))

    print("\nKonfusionsmatrix:")
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
    plt.ylabel('Wahre Klasse')
    plt.xlabel('Vorhergesagte Klasse')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return accuracy


def feature_importance_analysis(model, X_test, feature_names):
    """Report feature importances of a linear SVM.

    For a model with ``kernel == 'linear'`` the absolute coefficients are
    printed, and the ten largest are plotted to ``feature_importance.png``.
    For any other kernel only a notice is printed.

    Args:
        model: Fitted classifier exposing ``kernel`` and, for the linear
            kernel, ``coef_``.
        X_test: Unused; kept so existing callers keep working.
        feature_names: Names of the model's input features, in column order.
    """
    if model.kernel != 'linear':
        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar")
        return

    importance = np.abs(model.coef_[0])
    feature_importance = pd.DataFrame(
        {'feature': feature_names, 'importance': importance}
    ).sort_values('importance', ascending=False)

    print("\nFeature-Wichtigkeit (lineare SVM):")
    print(feature_importance)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
    plt.title('Top 10 Feature-Wichtigkeiten')
    plt.xlabel('Wichtigkeit')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
def main():
    """Run the whole pipeline: load, engineer features, train, evaluate.

    Returns:
        tuple: ``(model, scaler, imputer, feature_names)`` on success, or
        four ``None`` values if any step raised an exception.
    """
    print("Starte SVM-Modell für Sportarten-Klassifikation...")

    try:
        # 1. Load and pre-process the data
        df, features = load_and_preprocess_data()
        print(f"Daten geladen: {df.shape[0]} Datensätze, {len(features)} Features")
        print(f"Verteilung der Sportarten: {df['sport_category'].value_counts().to_dict()}")

        # 2. Feature engineering
        X, y, feature_names, imputer = feature_engineering(df, features)
        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features")

        # 3. Train the SVM model
        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)

        # 4. Evaluate the model
        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params)

        # 5. Feature analysis
        feature_importance_analysis(model, X_test, feature_names)

        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")

        # The importance plot is only written for a linear kernel, so only
        # list it when it was actually produced.
        saved_files = ['confusion_matrix.png']
        if getattr(model, 'kernel', None) == 'linear':
            saved_files.append('feature_importance.png')
        print(f"\nGespeicherte Dateien: {', '.join(saved_files)}")

        return model, scaler, imputer, feature_names

    except Exception as e:
        # Top-level boundary of the script: report the failure, including the
        # traceback, and hand back placeholders instead of crashing.
        import traceback

        print(f"Fehler bei der Ausführung: {str(e)}")
        traceback.print_exc()
        return None, None, None, None


if __name__ == "__main__":
    model, scaler, imputer, feature_names = main()