svm_modell.py aktualisiert
This commit is contained in:
parent
3d2b6f208e
commit
b15301a2b2
403
svm_modell.py
403
svm_modell.py
@ -1,204 +1,201 @@
|
|||||||
# Machine Learning Modell 1 / Leistungsnachweis II Data Science / 1. Semester
|
# Machine Learning Modell 1 / Leistungsnachweis II Data Science / 1. Semester
|
||||||
# Support Vector Machine zu binären Klassifikation
|
# Support Vector Machine zu binären Klassifikation
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
|
||||||
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
||||||
from sklearn.svm import SVC
|
from sklearn.svm import SVC
|
||||||
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
|
||||||
from sklearn.impute import SimpleImputer
|
from sklearn.impute import SimpleImputer
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
import seaborn as sns
|
import seaborn as sns
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import warnings
|
import warnings
|
||||||
warnings.filterwarnings('ignore')
|
warnings.filterwarnings('ignore')
|
||||||
|
|
||||||
def load_and_preprocess_data():
    """Load the raw Garmin activity export and derive a binary sport label.

    Reads ``Activities_rohdaten.csv`` (delimiter auto-detected), trims the
    column names, and labels each row via ``sport_category``:
    1 = endurance sport, 0 = strength sport; all other activity types are
    dropped. Also selects the numeric feature columns actually present
    in the file.

    Returns:
        tuple: (filtered DataFrame with a ``sport_category`` column,
                list of available numeric feature column names)
    """
    # Delimiter sniffing requires the python engine.
    raw = pd.read_csv("Activities_rohdaten.csv", sep=None, engine="python")
    df = raw.copy()

    # Column headers in the export may carry stray whitespace.
    df.columns = df.columns.str.strip()

    # Binary classification target: endurance vs. strength sports.
    endurance_sports = ['Laufen', 'Rennradfahren', 'Schwimmen', 'Radfahren']
    strength_sports = ['Krafttraining']

    def categorize(activity):
        # 1 = endurance, 0 = strength, -1 = anything else (discarded below).
        if activity in endurance_sports:
            return 1
        if activity in strength_sports:
            return 0
        return -1

    df['sport_category'] = df['Aktivitätstyp'].apply(categorize)

    # Keep only rows that belong to one of the two target classes.
    df = df[df['sport_category'] != -1]

    # Candidate numeric features; keep only those present in this export.
    numeric_features = ['Distanz', 'Kalorien', 'Ø Herzfrequenz', 'Maximale Herzfrequenz', 'Aerober TE', 'Training Stress Score®']
    available_features = [col for col in numeric_features if col in df.columns]

    return df, available_features
def feature_engineering(df, features):
    """Clean numeric features, derive calendar features and impute missing values.

    Object-typed feature columns are stripped of ',' characters and coerced
    to numeric (unparseable values become NaN). If a ``Datum`` column exists,
    hour / day-of-week / month are extracted and appended to the feature set.
    Missing values are filled with the per-column median.

    Args:
        df: DataFrame containing ``features`` columns and ``sport_category``.
        features: list of feature column names (not modified).

    Returns:
        tuple: (imputed feature matrix as ndarray, target Series,
                final feature-name list, fitted SimpleImputer)
    """
    df_processed = df.copy()

    # BUG FIX: the original extended the caller's list in place
    # (features.extend(...)), a surprising side effect on repeated calls.
    # Work on a private copy; callers still receive the extended list
    # through the return value, so the interface is unchanged.
    features = list(features)

    # Coerce string-typed numeric columns to real numbers.
    for feature in features:
        if df_processed[feature].dtype == 'object':
            # Drop ',' before conversion.
            # NOTE(review): assumes ',' is a thousands separator, not a
            # decimal comma — confirm against the CSV's locale.
            df_processed[feature] = df_processed[feature].astype(str).str.replace(',', '')
            df_processed[feature] = pd.to_numeric(df_processed[feature], errors='coerce')

    # Calendar features from the activity timestamp, if present.
    if 'Datum' in df_processed.columns:
        df_processed['Datum'] = pd.to_datetime(df_processed['Datum'], errors='coerce')
        df_processed['hour'] = df_processed['Datum'].dt.hour
        df_processed['day_of_week'] = df_processed['Datum'].dt.dayofweek
        df_processed['month'] = df_processed['Datum'].dt.month
        features.extend(['hour', 'day_of_week', 'month'])

    X = df_processed[features]
    y = df_processed['sport_category']

    # Median imputation is robust to the skewed distributions typical
    # of distance/calorie columns.
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(X)

    return X_imputed, y, features, imputer
def train_svm_model(X, y):
    """Train an SVM classifier with grid-searched hyperparameters.

    Splits the data 70/30 (stratified), standardizes features on the
    training split, grid-searches C / gamma / kernel with 5-fold CV,
    and evaluates the best estimator on the held-out test split.

    Returns:
        tuple: (best estimator, scaled test features, test targets,
                test predictions, best parameter dict, CV scores,
                fitted StandardScaler)
    """
    # Stratified hold-out split keeps the class balance in both sets.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

    # Fit the scaler on training data only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_test_std = scaler.transform(X_test)

    # Hyperparameter search space for the SVM.
    search_space = {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear'],
    }
    grid_search = GridSearchCV(SVC(random_state=42), search_space, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train_std, y_train)

    best_svm = grid_search.best_estimator_
    y_pred = best_svm.predict(X_test_std)

    # Cross-validated accuracy of the winning configuration.
    cv_scores = cross_val_score(best_svm, X_train_std, y_train, cv=5)

    return best_svm, X_test_std, y_test, y_pred, grid_search.best_params_, cv_scores, scaler
def evaluate_model(y_test, y_pred, cv_scores, best_params):
    """Print evaluation metrics and save a confusion-matrix heatmap.

    Args:
        y_test: true labels of the test split.
        y_pred: model predictions for the same split.
        cv_scores: cross-validation accuracy scores.
        best_params: winning hyperparameter dict from the grid search.

    Returns:
        float: test-set accuracy.
    """
    print("=== SVM-Modell für Sportarten-Klassifikation ===")
    # BUG FIX: the original printed len(y_test) + len(y_pred), which counts
    # every test sample twice (y_pred is the prediction FOR y_test) and so
    # reported double the true size.
    print(f"Anzahl der Test-Datensätze: {len(y_test)}")
    print(f"\nBeste Hyperparameter: {best_params}")

    print(f"\nKreuzvalidierung (CV-Scores): {cv_scores}")
    print(f"Mittlere CV-Genauigkeit: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

    # Compute accuracy once and reuse it for printing and the return value.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nTest-Genauigkeit: {accuracy:.4f}")

    print("\nKlassifikationsbericht:")
    # Label order: 0 = Kraftsport, 1 = Ausdauersport (see load step).
    print(classification_report(y_test, y_pred, target_names=['Kraftsport', 'Ausdauersport']))

    print("\nKonfusionsmatrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # Confusion-matrix heatmap, also saved to disk for the report.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Kraftsport', 'Ausdauersport'],
                yticklabels=['Kraftsport', 'Ausdauersport'])
    plt.title('Konfusionsmatrix - SVM Sportarten-Klassifikation')
    plt.ylabel('Wahre Klasse')
    plt.xlabel('Vorhergesagte Klasse')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

    return accuracy
def feature_importance_analysis(model, X_test, feature_names):
    """Report and plot feature importances for a linear-kernel SVM.

    Importance is the absolute value of each coefficient in the separating
    hyperplane; only defined for ``kernel='linear'``. For any other kernel
    a notice is printed and nothing else happens.

    Args:
        model: fitted SVC instance.
        X_test: test feature matrix (unused; kept for interface stability).
        feature_names: names matching the model's input columns.
    """
    # Guard clause: coefficients only exist for the linear kernel.
    if model.kernel != 'linear':
        print("\nFeature-Wichtigkeit nur für lineare SVM verfügbar")
        return

    coef_magnitudes = np.abs(model.coef_[0])
    ranking = pd.DataFrame({'feature': feature_names, 'importance': coef_magnitudes}).sort_values('importance', ascending=False)

    print("\nFeature-Wichtigkeit (lineare SVM):")
    print(ranking)

    # Bar chart of the strongest features, saved for the report.
    plt.figure(figsize=(10, 6))
    sns.barplot(data=ranking.head(10), x='importance', y='feature')
    plt.title('Top 10 Feature-Wichtigkeiten')
    plt.xlabel('Wichtigkeit')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
def main():
    """Run the full pipeline: load, engineer, train, evaluate, analyze.

    Returns:
        tuple: (model, scaler, imputer, feature_names) on success,
               (None, None, None, None) if any step raised.
    """
    print("Starte SVM-Modell für Sportarten-Klassifikation...")

    try:
        # 1. Load and preprocess the raw data.
        df, features = load_and_preprocess_data()
        print(f"Daten geladen: {df.shape[0]} Datensätze, {len(features)} Features")
        print(f"Verteilung der Sportarten: {df['sport_category'].value_counts().to_dict()}")

        # 2. Feature engineering.
        X, y, feature_names, imputer = feature_engineering(df, features)
        print(f"Feature Engineering abgeschlossen: {X.shape[1]} Features")

        # 3. Train the SVM with hyperparameter tuning.
        model, X_test, y_test, y_pred, best_params, cv_scores, scaler = train_svm_model(X, y)

        # 4. Evaluate on the held-out test split.
        accuracy = evaluate_model(y_test, y_pred, cv_scores, best_params)

        # 5. Feature-importance analysis (linear kernel only).
        feature_importance_analysis(model, X_test, feature_names)

        print(f"\nModell erfolgreich trainiert mit Genauigkeit: {accuracy:.4f}")
        # BUG FIX: corrected typo in the user-facing message
        # ("Gespicherte" -> "Gespeicherte").
        print("\nGespeicherte Dateien: confusion_matrix.png, feature_importance.png")

        return model, scaler, imputer, feature_names

    except Exception as e:
        # Top-level boundary: report the failure and return a sentinel
        # tuple instead of crashing the script.
        print(f"Fehler bei der Ausführung: {str(e)}")
        return None, None, None, None
# Script entry point: run the pipeline and keep the fitted artifacts
# around for interactive inspection.
if __name__ == "__main__":
    model, scaler, imputer, feature_names = main()
||||||
Loading…
x
Reference in New Issue
Block a user