# EDA

In [None]:
# Imports
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the raw activity export, reading only the columns we need.
# Keys are the column headers as they appear in the CSV; values are the
# snake_case names used throughout the rest of the notebook.
cols = {
    "Aktivitätstyp": "activity_type",
    "Distanz": "distance_km",
    "Kalorien": "calories_burned",
    "Zeit": "duration_str",
    "Ø Herzfrequenz": "heart_rate"
}

data = (
    pd.read_csv("data/raw/all_activities.csv", usecols=list(cols))
    .rename(columns=cols)
)

In [None]:
# Preview the first rows of the loaded, renamed frame.
data.head()

In [None]:
# (rows, columns) of the frame right after loading.
data.shape

In [None]:
# Data Preprocessing
# Count the missing values in each column
data.isna().sum()

In [None]:
# Parse the duration strings as timedeltas and store them as seconds,
# then drop the original text column in favour of the numeric one.
duration_td = pd.to_timedelta(data["duration_str"])
data = (
    data
    .assign(duration_seconds=duration_td.dt.total_seconds())
    .drop(columns=["duration_str"])
)
data.head()

In [None]:
# Strip commas from the distance, calories and heart-rate columns and convert
# them to numeric. Anything that still cannot be parsed (for example a "--"
# placeholder) becomes NaN because of errors='coerce'.
#
# Each column is cast to str first: read_csv may already have parsed a column
# as numeric when none of its values contained a comma or placeholder, and the
# .str accessor raises AttributeError on non-string columns.
#
# NOTE(review): commas are removed as if they were thousands separators. If the
# export uses a decimal comma (e.g. "5,02" km), this would inflate the values —
# confirm against the raw file.
for column in ['distance_km', 'calories_burned', 'heart_rate']:
    data[column] = pd.to_numeric(
        data[column].astype(str).str.replace(',', '', regex=False),
        errors='coerce')
data.head()

In [None]:
# Check data types after the conversions above; distance, calories and
# heart rate have been passed through pd.to_numeric at this point.
data.dtypes

In [None]:
# Count missing values per column again: values that could not be parsed
# as numbers were turned into NaN by the conversion step.
data.isna().sum()

In [None]:
# Remove every row that has a missing value in any column
data = data.dropna(axis=0, how='any')

In [None]:
# how many "--" in heart rate now
# NOTE: heart_rate was already passed through pd.to_numeric(errors='coerce')
# above, so any "--" placeholder became NaN and was removed by dropna().
# This lookup therefore falls back to the default of 0.
data['heart_rate'].value_counts().get('--', 0)

In [None]:
# Keep only rows whose heart rate is not the "--" placeholder.
# (heart_rate is already numeric here, so this keeps every row.)
has_heart_rate = data['heart_rate'] != '--'
data = data[has_heart_rate]

In [None]:
# (rows, columns) remaining after the incomplete rows were dropped.
data.shape

In [None]:
# Cast duration (seconds), calories burned and heart rate to integer dtype
# in a single pass. astype(int) truncates any fractional part.
integer_columns = ["duration_seconds", "calories_burned", "heart_rate"]
data = data.astype({name: int for name in integer_columns})
data.dtypes

In [None]:
# Save the cleaned data to a new CSV file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (otherwise a fresh checkout fails with OSError).
cleaned_path = Path('data/cleaned/cleaned_activities.csv')
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(cleaned_path, index=False)

In [None]:
# Number of distinct activity types (nunique ignores missing values by default)
data['activity_type'].nunique()

In [None]:
# Row count per activity type, to spot categories with very few examples
activity_counts = data['activity_type'].value_counts()
print(activity_counts)

At this point we see that there are 14 types of activities. To simplify, we can group similar activities together:

- Gehen, Wandern --> Gehen
- Indoor Cycling, Rennradfahren, Virtuelles Radfahren, Radfahren --> Radfahren
- Laufbandtraining --> Laufen

Crosstrainer (3 rows), Skifahren (2 rows) and Stepper (1 row) have too few examples for our model to learn from, so it is better to drop these rows.

In [None]:
# Fold closely related activity types into one label each
activity_groups = {
    'Wandern': 'Gehen',
    'Indoor Cycling': 'Radfahren',
    'Rennradfahren': 'Radfahren',
    'Virtuelles Radfahren': 'Radfahren',
    'Laufbandtraining': 'Laufen',
}
data['activity_type'] = data['activity_type'].replace(activity_groups)

# Remove the activity types that have too few rows to be useful
rare_activities = ['Crosstrainer', 'Skifahren (Piste)', 'Stepper']
is_rare = data['activity_type'].isin(rare_activities)
data = data[~is_rare]

In [None]:
# Row count per activity type after grouping and dropping rare categories
activity_counts = data['activity_type'].value_counts()
print(activity_counts)

Now we have 7 activity types, with occurrences ranging from 12 to 183. 

# Prepare training and test sets

Our data is cleaned and ready. 
The `activity_type` column contains strings, so we will use one-hot encoding to include it in our regression models.

In [None]:
# Column roles: one categorical input, three numeric inputs and the target.
categorical_features = ['activity_type']
numeric_features = ['distance_km', 'duration_seconds', 'heart_rate']
features = categorical_features + numeric_features
target = 'calories_burned'

x, y = data[features], data[target]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)
n_train_rows, n_test_rows = x_train.shape[0], x_test.shape[0]
print(
    f"Data split into {n_train_rows} training rows and {n_test_rows} testing rows.")

# Preprocessing: standardise the numeric columns and one-hot encode the
# activity type. handle_unknown='ignore' lets transform() accept a category
# that was not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Fit on the training rows only, then apply the same fitted transform to
# the test rows. From here on x_train / x_test hold the transformed matrices.
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

print("\n--- Preprocessing complete ---")
print("Shape of processed training data:", x_train.shape)
print("Shape of processed testing data:", x_test.shape)

# Correlation Matrix

In [None]:

# Pearson correlation between the numeric columns, drawn as an annotated heatmap
correlation_matrix = data.corr(numeric_only=True, method='pearson')

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Korrelationsmatrix der numerischen Merkmale (Pearson)")
plt.show()

In [None]:
# Evaluation Function
def evaluate_results(y_pred, y_true):
    """Print and return regression metrics for a set of predictions.

    Note the argument order: predictions first, ground truth second.

    Args:
        y_pred: Predicted target values (array-like).
        y_true: Observed target values (array-like), same length as y_pred.

    Returns:
        dict with the keys "mse", "r2", "mae" and "rmse".
    """
    # Work on plain arrays so the MAE subtraction is positional. With two
    # pandas Series, `y_true - y_pred` would align on index labels instead,
    # giving NaN or wrong values when the indices differ.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mae = np.mean(np.abs(y_true - y_pred))
    rmse = np.sqrt(mse)
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R^2 Score: {r2:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Root Mean Squared Error: {rmse:.2f}")
    return {"mse": mse, "r2": r2, "mae": mae, "rmse": rmse}

Once a model is trained, we will test how good it is. 

We will use ***R-Squared*** to see how well the model fits the data.

We will use ***Mean Absolute Error*** to see how wrong the predictions are, on average.


# Linear Regression

In [None]:
def run_linear_regression(x_train, y_train, x_test, y_test):
    """Fit a LinearRegression model and evaluate it on the test set.

    Args:
        x_train: Feature matrix used for fitting.
        y_train: Target values used for fitting.
        x_test: Feature matrix used for prediction.
        y_test: Target values the predictions are compared against.

    Returns:
        The metrics dict produced by evaluate_results.
    """
    regressor = LinearRegression()
    print("\n--- Linear Regression model created ---")

    regressor.fit(x_train, y_train)
    print("\n--- Model training complete ---")

    test_predictions = regressor.predict(x_test)
    print("\n--- Predictions on test set complete ---")

    print("Linear Regression result evaluation: ")
    metrics = evaluate_results(test_predictions, y_test)
    return metrics


run_linear_regression(x_train, y_train, x_test, y_test)

# Decision Tree Regression

In [None]:
def run_decision_tree_regressor(x_train, y_train, x_test, y_test):
    """Fit a single DecisionTreeRegressor and evaluate it on the test set.

    The tree is built with a fixed seed, no depth limit and the
    squared-error split criterion.

    Args:
        x_train: Feature matrix used for fitting.
        y_train: Target values used for fitting.
        x_test: Feature matrix used for prediction.
        y_test: Target values the predictions are compared against.

    Returns:
        The metrics dict produced by evaluate_results.
    """
    tree_settings = dict(
        random_state=42,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        criterion='squared_error',
    )
    regressor = DecisionTreeRegressor(**tree_settings)
    print("\n--- Decision Tree Regressor (max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='squared_error') model created ---")

    regressor.fit(x_train, y_train)
    print("\n--- Model training complete ---")

    test_predictions = regressor.predict(x_test)
    print("\n--- Predictions on test set complete ---")

    print("Decision Tree Regressor result evaluation: ")
    metrics = evaluate_results(test_predictions, y_test)
    return metrics


run_decision_tree_regressor(x_train, y_train, x_test, y_test)

# Decision Tree Regression with Grid Search

In [None]:
def run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test):
    """Tune a DecisionTreeRegressor with a grid search, then evaluate the best one.

    Every combination in the grid is scored with 5-fold cross-validation on
    the training data using negative mean squared error; the best estimator
    is then used to predict the test set.

    Args:
        x_train: Feature matrix used for the search.
        y_train: Target values used for the search.
        x_test: Feature matrix used for prediction.
        y_test: Target values the predictions are compared against.

    Returns:
        The metrics dict produced by evaluate_results.
    """
    # Hyperparameter grid to search over
    search_space = {
        'max_depth': [3, 5, 7, 10, None],  # None means no limit
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'criterion': ['squared_error', 'absolute_error'],
    }

    searcher = GridSearchCV(
        estimator=DecisionTreeRegressor(random_state=42),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )

    print("Starting Grid Search...")
    searcher.fit(x_train, y_train)
    print("Grid Search complete.")

    print(f"Best Hyperparameters: {searcher.best_params_}")

    # Predict on the test set with the best estimator found by the search
    test_predictions = searcher.best_estimator_.predict(x_test)

    print("\n--- Predictions on test set complete ---")
    print("Tuned Decision Tree Regressor result evaluation: ")
    metrics = evaluate_results(test_predictions, y_test)
    return metrics


run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test)

# Gradient Boost Regression

In [None]:
def run_gradient_boosting_regressor(x_train, y_train, x_test, y_test):
    """Fit a GradientBoostingRegressor and evaluate it on the test set.

    Uses 100 estimators, a learning rate of 0.1 and a fixed seed.

    Args:
        x_train: Feature matrix used for fitting.
        y_train: Target values used for fitting.
        x_test: Feature matrix used for prediction.
        y_test: Target values the predictions are compared against.

    Returns:
        The metrics dict produced by evaluate_results.
    """
    booster_settings = dict(n_estimators=100, learning_rate=0.1, random_state=42)
    booster = GradientBoostingRegressor(**booster_settings)
    print("\n--- Gradient Boosting Regressor (n_estimators=100, learning_rate=0.1, random_state=42) model created ---")

    booster.fit(x_train, y_train)
    print("\n--- Model training complete ---")

    test_predictions = booster.predict(x_test)

    print("\n--- Predictions on test set complete ---")

    print("Gradient Boosting Regressor result evaluation: ")
    metrics = evaluate_results(test_predictions, y_test)
    return metrics


run_gradient_boosting_regressor(x_train, y_train, x_test, y_test)

# Gradient Boost Regression with Grid Search

In [None]:
def run_gradient_boosting_regressor_with_grid_search(x_train, y_train, x_test, y_test):
    """Tune a GradientBoostingRegressor with a grid search, then evaluate the best one.

    Every combination in the grid is scored with 5-fold cross-validation on
    the training data using negative mean squared error; the best estimator
    is then used to predict the test set.

    Args:
        x_train: Feature matrix used for the search.
        y_train: Target values used for the search.
        x_test: Feature matrix used for prediction.
        y_test: Target values the predictions are compared against.

    Returns:
        The metrics dict produced by evaluate_results.
    """
    # Hyperparameter grid to search over
    search_space = {
        'n_estimators': [100, 300, 500],
        'learning_rate': [0.01, 0.05, 0.1],
        'subsample': [0.8, 1.0],
        'max_depth': [3, 5, 7],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [3, 5],
    }

    searcher = GridSearchCV(
        estimator=GradientBoostingRegressor(random_state=42),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )

    print("Starting Grid Search...")
    searcher.fit(x_train, y_train)
    print("Grid Search complete.")

    print(f"Best Hyperparameters: {searcher.best_params_}")

    # Predict on the test set with the best estimator found by the search
    best_gbm_model = searcher.best_estimator_
    test_predictions = best_gbm_model.predict(x_test)

    print("\n--- Predictions on test set complete ---")
    print("Tuned Gradient Boosting Regressor result evaluation: ")
    metrics = evaluate_results(test_predictions, y_test)
    return metrics


run_gradient_boosting_regressor_with_grid_search(
    x_train, y_train, x_test, y_test)

# Compare all models

In [None]:

# Run every model once more and collect its metrics for a side-by-side table.
# Note that this re-fits all five models, including both grid searches, so
# this cell takes as long as the individual model cells combined.
linear_regession_results = run_linear_regression(
    x_train, y_train, x_test, y_test)
desicion_tree_results = run_decision_tree_regressor(
    x_train, y_train, x_test, y_test)
tuned_decision_tree_results = run_decision_tree_regressor_with_grid_search(
    x_train, y_train, x_test, y_test)
gradient_boosting_results = run_gradient_boosting_regressor(
    x_train, y_train, x_test, y_test)
tuned_gradient_boosting_results = run_gradient_boosting_regressor_with_grid_search(
    x_train, y_train, x_test, y_test)

# Model name -> metrics dict ("mse", "r2", "mae", "rmse")
results_data = {
    'Linear Regression': linear_regession_results,
    'Decision Tree': desicion_tree_results,
    'Tuned Decision Tree': tuned_decision_tree_results,
    'Gradient Boosting': gradient_boosting_results,
    'Tuned Gradient Boosting': tuned_gradient_boosting_results
}

# One row per model, one column per metric, best R^2 first
df_results = pd.DataFrame(results_data).T.sort_values(by='r2', ascending=False)
df_results

In [None]:
# Bar chart of the R^2 score per model, highest first
fig, ax = plt.subplots(figsize=(10, 6))
r2_by_model = df_results['r2'].sort_values(ascending=False)
r2_by_model.plot(kind='bar', color='skyblue', ax=ax)
ax.set_title('$R^2$ Score Comparison Across Models', fontsize=14)
ax.set_ylabel('$R^2$ Score')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the RMSE per model, lowest (best) first
fig, ax = plt.subplots(figsize=(10, 6))
rmse_by_model = df_results['rmse'].sort_values(ascending=True)
rmse_by_model.plot(kind='bar', color='salmon', ax=ax)
ax.set_title('RMSE Comparison Across Models', fontsize=14)
ax.set_ylabel('Root Mean Squared Error (RMSE)')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

### Analysis & Interpretation:

The model performed very well, explaining 92.2% of the variance in calories burned (R² = 0.92). This confirms our selected features are highly predictive.

**The Mean Absolute Error (MAE)** shows that, on average, the model's prediction is off by approximately 73 calories.

The most important insight comes from comparing the RMSE (123.76) to the MAE (73.45). The RMSE is significantly larger, which strongly indicates that the model has a problem with outliers. While most predictions are good (off by 73), a few predictions are very wrong, and the RMSE (which penalizes large errors) is highlighting this.