svm_modell.py aktualisiert

This commit is contained in:
Adu Alex Göllnitz 2025-11-19 15:41:28 +01:00
parent 3d2b6f208e
commit b15301a2b2

View File

@ -1,204 +1,201 @@
# Machine Learning Modell 1 / Leistungsnachweis II Data Science / 1. Semester
# Support Vector Machine zur binären Klassifikation
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

warnings.filterwarnings('ignore')


def load_and_preprocess_data():
    """Load the activity export and derive the binary target column.

    Reads ``Activities_rohdaten.csv`` from the current working directory
    (the delimiter is sniffed by pandas), strips whitespace from the column
    names and labels every row by its ``Aktivitätstyp``:

    * ``1`` for endurance sports,
    * ``0`` for strength sports.

    Rows with any other activity type are dropped.

    Returns:
        tuple: ``(df, available_features)`` where ``df`` is the filtered
        DataFrame with an integer ``sport_category`` column and
        ``available_features`` lists those candidate numeric feature
        columns that actually exist in the file.

    Raises:
        KeyError: If the file has no ``Aktivitätstyp`` column.
    """
    activities = pd.read_csv("Activities_rohdaten.csv", sep=None, engine="python")

    df = activities.copy()
    df.columns = df.columns.str.strip()

    if 'Aktivitätstyp' not in df.columns:
        raise KeyError(
            "Spalte 'Aktivitätstyp' fehlt in Activities_rohdaten.csv "
            f"(vorhandene Spalten: {list(df.columns)})"
        )

    # Binary classification: endurance sports (1) versus strength sports (0).
    endurance_sports = ['Laufen', 'Rennradfahren', 'Schwimmen', 'Radfahren']
    strength_sports = ['Krafttraining']
    category_by_sport = {sport: 1 for sport in endurance_sports}
    category_by_sport.update({sport: 0 for sport in strength_sports})

    # Activity types outside both groups map to NaN and are dropped here.
    categories = df['Aktivitätstyp'].map(category_by_sport)
    # Copy so that adding the target column never writes into a slice.
    df = df.loc[categories.notna()].copy()
    df['sport_category'] = categories.dropna().astype(int)

    numeric_features = [
        'Distanz',
        'Kalorien',
        'Ø Herzfrequenz',
        'Maximale Herzfrequenz',
        'Aerober TE',
        'Training Stress Score®',
    ]
    # Keep only the candidate features that are present in this export.
    available_features = [col for col in numeric_features if col in df.columns]

    return df, available_features


def feature_engineering(df, features):
    """Clean the numeric features, add time features and impute gaps.

    Args:
        df: DataFrame holding the columns named in ``features``, a
            ``sport_category`` column and, optionally, a ``Datum`` column.
        features: Names of the numeric feature columns. The list passed in
            is left unmodified.

    Returns:
        tuple: ``(X_imputed, y, feature_names, imputer)`` where
        ``X_imputed`` is the median-imputed feature matrix, ``y`` the
        ``sport_category`` column, ``feature_names`` the column names of
        ``X_imputed`` in order, and ``imputer`` the fitted
        ``SimpleImputer``.

    Raises:
        ValueError: If no feature column contains any usable value.
    """
    df_processed = df.copy()
    # Work on a copy so the caller's list is not extended as a side effect.
    feature_names = list(features)

    for feature in feature_names:
        if not pd.api.types.is_numeric_dtype(df_processed[feature]):
            # NOTE(review): commas are removed as if they were thousands
            # separators. If the export uses a decimal comma this would
            # scale values incorrectly -- confirm against the raw file.
            df_processed[feature] = (
                df_processed[feature].astype(str).str.replace(',', '', regex=False)
            )
        # Anything that still is not a number becomes NaN and is imputed below.
        df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')

    if 'Datum' in df_processed.columns:
        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
        timestamps = df_processed['Datum'].dt
        time_features = {
            'hour': timestamps.hour,
            'day_of_week': timestamps.dayofweek,
            'month': timestamps.month,
        }
        for name, values in time_features.items():
            df_processed[name] = values
            if name not in feature_names:
                feature_names.append(name)

    # SimpleImputer discards columns without a single observed value, which
    # would leave the returned names out of sync with the matrix columns.
    # Drop such columns explicitly so names and columns always line up.
    usable_features = [
        name for name in feature_names if df_processed[name].notna().any()
    ]
    if not usable_features:
        raise ValueError("Keine Feature-Spalte enthält verwertbare Werte")

    X = df_processed[usable_features]
    y = df_processed['sport_category']

    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    return X_imputed, y, usable_features, imputer


def train_svm_model(X, y):
    """Tune an SVM with grid search and evaluate it on a held-out split.

    The data is split 70/30 (stratified, ``random_state=42``). Scaling and
    the SVM are tuned together as one pipeline, so that during
    cross-validation the scaler is fitted on the training folds only and
    the validation fold does not leak into the scaling statistics.

    Args:
        X: Feature matrix without missing values.
        y: Binary target labels.

    Returns:
        tuple: ``(best_svm, X_test_scaled, y_test, y_pred, best_params,
        cv_scores, scaler)`` -- the fitted ``SVC``, the scaled test
        features, the test labels, the test predictions, the best
        hyperparameters (plain ``SVC`` parameter names), the 5-fold
        cross-validation scores on the training split, and the fitted
        ``StandardScaler``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svc', SVC(random_state=42)),
    ])

    c_values = [0.1, 1, 10, 100]
    # gamma has no effect on the linear kernel, so it is only searched for
    # rbf; this avoids fitting identical linear models several times over.
    param_grid = [
        {
            'svc__kernel': ['rbf'],
            'svc__C': c_values,
            'svc__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
        },
        {
            'svc__kernel': ['linear'],
            'svc__C': c_values,
        },
    ]

    grid_search = GridSearchCV(
        pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1
    )
    grid_search.fit(X_train, y_train)

    # The best pipeline is refitted on the whole training split, so its
    # scaler and SVC can be handed out separately.
    best_pipeline = grid_search.best_estimator_
    scaler = best_pipeline.named_steps['scaler']
    best_svm = best_pipeline.named_steps['svc']

    X_test_scaled = scaler.transform(X_test)
    y_pred = best_svm.predict(X_test_scaled)

    # Cross-validate the full pipeline so each fold rescales independently.
    cv_scores = cross_val_score(best_pipeline, X_train, y_train, cv=5)

    # Strip the pipeline step prefix ("svc__C" -> "C") for reporting.
    best_params = {
        name.split('__', 1)[1]: value
        for name, value in grid_search.best_params_.items()
    }

    return best_svm, X_test_scaled, y_test, y_pred, best_params, cv_scores, scaler


def evaluate_model(y_test, y_pred, cv_scores, best_params):
    """Print the evaluation metrics and plot the confusion matrix.

    Writes the plot to ``confusion_matrix.png`` and also shows it.

    Args:
        y_test: True labels of the test split (0 = strength, 1 = endurance).
        y_pred: Predicted labels for the test split.
        cv_scores: Array of cross-validation accuracy scores.
        best_params: Best hyperparameters found by the grid search.

    Returns:
        float: Accuracy on the test split.
    """
    # Fix the label order so that report rows and matrix axes always match
    # the class names, even if one class is missing from the test split.
    class_labels = [0, 1]
    class_names = ['Kraftsport', 'Ausdauersport']
    test_accuracy = accuracy_score(y_test, y_pred)

    print("=== SVM-Modell für Sportarten-Klassifikation ===")
    # y_pred has one entry per test record, so only y_test is counted.
    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
    print(f"\nBeste Hyperparameter: {best_params}")

    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    print(f"\nTest-Genauigkeit: {test_accuracy:.4f}")

    print("\nKlassifikationsbericht:")
    print(classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=class_names,
        zero_division=0,
    ))

    print("\nKonfusionsmatrix:")
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    print(cm)

    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
    plt.ylabel('Wahre Klasse')
    plt.xlabel('Vorhergesagte Klasse')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return test_accuracy


def feature_importance_analysis(model, X_test, feature_names):
    """Report feature importances of a linear SVM.

    For a linear kernel the absolute coefficients of the first row of
    ``model.coef_`` are used as importances; they are printed, plotted
    (top 10), saved to ``feature_importance.png`` and shown. For any other
    kernel only a notice is printed.

    Args:
        model: Fitted SVM exposing ``kernel`` and, for linear kernels,
            ``coef_``.
        X_test: Unused; kept so existing callers keep working.
        feature_names: Names of the features in coefficient order.

    Raises:
        ValueError: If the number of names differs from the number of
            coefficients.
    """
    if model.kernel != 'linear':
        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar")
        return

    importance = np.abs(model.coef_[0])
    if len(feature_names) != len(importance):
        raise ValueError(
            f"Anzahl der Feature-Namen ({len(feature_names)}) passt nicht "
            f"zur Anzahl der Koeffizienten ({len(importance)})"
        )

    feature_importance = pd.DataFrame(
        {'feature': feature_names, 'importance': importance}
    ).sort_values('importance', ascending=False)

    print("\nFeature-Wichtigkeit (lineare SVM):")
    print(feature_importance)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
    plt.title('Top 10 Feature-Wichtigkeiten')
    plt.xlabel('Wichtigkeit')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()


def main():
    """Run the whole workflow: load, engineer, train, evaluate, analyse.

    Returns:
        tuple: ``(model, scaler, imputer, feature_names)`` on success, or
        ``(None, None, None, None)`` if any step raised an exception.
    """
    print("Starte SVM-Modell für Sportarten-Klassifikation...")

    try:
        # Load and preprocess the data
        df, features = load_and_preprocess_data()
        print(f"Daten geladen: {df.shape[0]} Datensätze, {len(features)} Features")
        print(f"Verteilung der Sportarten: {df['sport_category'].value_counts().to_dict()}")

        # Feature engineering
        X, y, feature_names, imputer = feature_engineering(df, features)
        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features")

        # Train the SVM
        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)

        # Evaluate the model
        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params)

        # Feature analysis
        feature_importance_analysis(model, X_test, feature_names)

        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")

        # The importance plot is only written for a linear kernel, so list
        # it only when it actually exists.
        saved_files = ['confusion_matrix.png']
        if getattr(model, 'kernel', None) == 'linear':
            saved_files.append('feature_importance.png')
        print(f"\nGespeicherte Dateien: {', '.join(saved_files)}")

        return model, scaler, imputer, feature_names

    except Exception as e:
        # Top-level boundary: report the failure instead of crashing, and
        # include the exception type, since e.g. a KeyError message alone
        # is just the missing key.
        print(f"Fehler bei der Ausführung: {type(e).__name__}: {e}")
        return None, None, None, None


# Run the workflow only when executed as a script, not on import.
if __name__ == "__main__":
    model, scaler, imputer, feature_names = main()