cds-1011-health-data-machin.../ml_calories.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "91deb647",
   "metadata": {},
   "source": [
    "# EDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3d701954",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.ensemble import GradientBoostingRegressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "255f60b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import the data\n",
    "cols = {\n",
    "    \"Aktivitätstyp\": \"activity_type\",\n",
    "    \"Distanz\": \"distance_km\",\n",
    "    \"Anstieg gesamt\": \"elevation_meters\",\n",
    "    \"Kalorien\": \"calories_burned\",\n",
    "    \"Zeit\": \"duration_str\",\n",
    "    \"Ø Herzfrequenz\": \"heart_rate\"\n",
    "}\n",
    "\n",
    "data = pd.read_csv(\n",
    "    \"data/raw/all_activities.csv\",\n",
    "    usecols=list(cols.keys())\n",
    ")\n",
    "data.rename(columns=cols, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f37051fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "785d030f",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9897a6e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Preprocessing\n",
    "# Check for missing values\n",
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2ada8082",
   "metadata": {},
   "outputs": [],
   "source": [
    "# change duration to seconds\n",
    "data[\"duration_seconds\"] = pd.to_timedelta(\n",
    "    data[\"duration_str\"]).dt.total_seconds()\n",
    "data.drop(columns=[\"duration_str\"], inplace=True)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d67708c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get rid of commas in Distance and Calories Burned columns and convert to numeric\n",
    "data['distance_km'] = pd.to_numeric(\n",
    "    data['distance_km'].str.replace(',', ''), errors='coerce')\n",
    "data['elevation_meters'] = pd.to_numeric(\n",
    "    data['elevation_meters'].str.replace(',', ''), errors='coerce')\n",
    "data['calories_burned'] = pd.to_numeric(\n",
    "    data['calories_burned'].str.replace(',', ''), errors='coerce')\n",
    "data['heart_rate'] = pd.to_numeric(\n",
    "    data['heart_rate'].str.replace(',', ''), errors='coerce')\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4c14efa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check data types\n",
    "data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac7a0f36",
   "metadata": {},
   "outputs": [],
   "source": [
    "# how many none values are there now\n",
    "data.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3676a6cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop rows with missing values\n",
    "data = data.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b4e6146",
   "metadata": {},
   "outputs": [],
   "source": [
    "# how many \"--\" in heart rate now\n",
    "data['heart_rate'].value_counts().get('--', 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d84e8f3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# drop rows with \"--\" in Heart Rate\n",
    "data = data[data['heart_rate'] != '--']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67785fce",
   "metadata": {},
   "outputs": [],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06b0bf1d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# change dtype of Duration (seconds), Calories Burned and Heart Rate to integer\n",
    "data[\"duration_seconds\"] = data[\"duration_seconds\"].astype(int)\n",
    "data[\"calories_burned\"] = data[\"calories_burned\"].astype(int)\n",
    "data[\"heart_rate\"] = data[\"heart_rate\"].astype(int)\n",
    "data.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "414362eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# save the cleaned data to a new csv file\n",
    "data.to_csv('data/cleaned/cleaned_activities.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2474dbe2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# how many unique activity types are there\n",
    "data['activity_type'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b99e6f62",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check for rare categories in Activity Type\n",
    "print(data['activity_type'].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5e2ec61f",
   "metadata": {},
   "source": [
    "At this point we see there are 14 types of activities. To simplfy we can group some of similar activities. \n",
    "\n",
    "Gehen, Wandern --> Gehen\n",
    "Indoor Cycling, Rennradfahren, Virtuelles Radfahren, Radfahren--> Radfahren\n",
    "\n",
    "Since there are for Crosstrainer 3, Skifahren 2 and Stepper 1 rows it is better to drop these rows because our model cannot learn from so few examples."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b9e1803b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# group similar activities\n",
    "data['activity_type'] = data['activity_type'].replace({\n",
    "    'Wandern': 'Gehen',\n",
    "    'Indoor Cycling': 'Radfahren',\n",
    "    'Rennradfahren': 'Radfahren',\n",
    "    'Virtuelles Radfahren': 'Radfahren',\n",
    "    'Laufbandtraining': 'Laufen'\n",
    "})\n",
    "\n",
    "# drop the rare activity types\n",
    "rare_activities = ['Crosstrainer', 'Skifahren (Piste)', 'Stepper']\n",
    "data = data[~data['activity_type'].isin(rare_activities)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c265f4b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# checking the final counts of activity types\n",
    "print(data['activity_type'].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "20d39609",
   "metadata": {},
   "source": [
    "Now we have 7 Activity Types with occurences varying from 12 to 183. "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "53d208e4",
   "metadata": {},
   "source": [
    "# Prepare training and test sets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07cc0009",
   "metadata": {},
   "source": [
    "Our data is cleaned and ready. \n",
    "Our first column \"Activity Type\" is string and we will use \"One Hot Encoding\" to involve it to our linear regression model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a165c69e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# define features and target variable\n",
    "features = ['activity_type', 'distance_km',\n",
    "            'elevation_meters', 'duration_seconds', 'heart_rate']\n",
    "target = 'calories_burned'\n",
    "\n",
    "x = data[features]\n",
    "y = data[target]\n",
    "\n",
    "# split the data into training and testing sets\n",
    "x_train, x_test, y_train, y_test = train_test_split(\n",
    "    x, y, test_size=0.2, random_state=42)\n",
    "print(\n",
    "    f\"Data split into {x_train.shape[0]} training rows and {x_test.shape[0]} testing rows.\")\n",
    "\n",
    "# Preprocessing: Scaling numeric features and encoding categorical features\n",
    "numeric_features = ['distance_km', 'elevation_meters',\n",
    "                    'duration_seconds', 'heart_rate']\n",
    "categorical_features = ['activity_type']\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', StandardScaler(), numeric_features),\n",
    "        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)\n",
    "    ]\n",
    ")\n",
    "\n",
    "x_train = preprocessor.fit_transform(x_train)\n",
    "x_test = preprocessor.transform(x_test)\n",
    "\n",
    "print(\"\\n--- Preprocessing complete ---\")\n",
    "print(\"Shape of processed training data:\", x_train.shape)\n",
    "print(\"Shape of processed testing data:\", x_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ec79f7b",
   "metadata": {},
   "source": [
    "# Correlation Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1bef2f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "correlation_matrix = data.corr(numeric_only=True, method='pearson')\n",
    "\n",
    "plt.figure(figsize=(12, 10))\n",
    "sns.heatmap(correlation_matrix, annot=True, cmap=\"coolwarm\", center=0)\n",
    "plt.title(\"Korrelationsmatrix der numerischen Merkmale (Pearson)\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cb437880",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Evaluation Function\n",
    "def evaluate_results(y_pred, y_true):\n",
    "    mse = mean_squared_error(y_true, y_pred)\n",
    "    r2 = r2_score(y_true, y_pred)\n",
    "    mae = np.mean(np.abs(y_true - y_pred))\n",
    "    rmse = np.sqrt(mse)\n",
    "    print(f\"Mean Squared Error: {mse:.2f}\")\n",
    "    print(f\"R^2 Score: {r2:.2f}\")\n",
    "    print(f\"Mean Absolute Error: {mae:.2f}\")\n",
    "    print(f\"Root Mean Squared Error: {rmse:.2f}\")\n",
    "    return {\"mse\": mse, \"r2\": r2, \"mae\": mae, \"rmse\": rmse}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f79e360a",
   "metadata": {},
   "source": [
    "Our Model is trained now we will test how good it is. \n",
    "\n",
    "We will use ***R-Squared*** to see how well the line fits the data.\n",
    "\n",
    "We will use ***Mean Absolute Error*** to see how wrong the predictions are, on average.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8273d79",
   "metadata": {},
   "source": [
    "# Lineare Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e0841794",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_linear_regression(x_train, y_train, x_test, y_test):\n",
    "    # Create the model\n",
    "    lr_model = LinearRegression()\n",
    "    print(\"\\n--- Linear Regression model created ---\")\n",
    "\n",
    "    # Train the Model\n",
    "    lr_model.fit(x_train, y_train)\n",
    "    print(\"\\n--- Model training complete ---\")\n",
    "\n",
    "    # Make predictions on the test set\n",
    "    y_pred = lr_model.predict(x_test)\n",
    "    print(\"\\n--- Predictions on test set complete ---\")\n",
    "\n",
    "    print(\"Linear Regression result evaluation: \")\n",
    "    return evaluate_results(y_pred, y_test)\n",
    "\n",
    "\n",
    "run_linear_regression(x_train, y_train, x_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ec3f04a",
   "metadata": {},
   "source": [
    "# Decision Tree Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1c72308",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_decision_tree_regressor(x_train, y_train, x_test, y_test):\n",
    "    # Create the model\n",
    "    tree = DecisionTreeRegressor(random_state=42, max_depth=None,\n",
    "                                 min_samples_split=2, min_samples_leaf=1, criterion='squared_error')\n",
    "    print(\"\\n--- Decision Tree Regressor (max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='squared_error') model created ---\")\n",
    "\n",
    "    # Train the Model\n",
    "    tree.fit(x_train, y_train)\n",
    "    print(\"\\n--- Model training complete ---\")\n",
    "\n",
    "    # Make predictions on the test set\n",
    "    y_pred = tree.predict(x_test)\n",
    "    print(\"\\n--- Predictions on test set complete ---\")\n",
    "\n",
    "    print(\"Decision Tree Regressor result evaluation: \")\n",
    "    return evaluate_results(y_pred, y_test)\n",
    "\n",
    "\n",
    "run_decision_tree_regressor(x_train, y_train, x_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e161de4",
   "metadata": {},
   "source": [
    "# Desicion Tree Regression with Grid Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd75750e",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
    "    # Define Hyperparameters\n",
    "    param_grid = {\n",
    "        'max_depth': [3, 5, 7, 10, None],  # None means no limit\n",
    "        'min_samples_split': [2, 5, 10],\n",
    "        'min_samples_leaf': [1, 2, 4],\n",
    "        'criterion': ['squared_error', 'absolute_error']\n",
    "    }\n",
    "\n",
    "    # Create the model\n",
    "    tree = DecisionTreeRegressor(random_state=42)\n",
    "\n",
    "    # Set up Grid Search\n",
    "    grid_search = GridSearchCV(\n",
    "        estimator=tree,\n",
    "        param_grid=param_grid,\n",
    "        scoring='neg_mean_squared_error',\n",
    "        cv=5,\n",
    "        verbose=1,\n",
    "        n_jobs=-1\n",
    "    )\n",
    "\n",
    "    # Train the model with Grid Search\n",
    "    print(\"Starting Grid Search...\")\n",
    "    grid_search.fit(x_train, y_train)\n",
    "    print(\"Grid Search complete.\")\n",
    "\n",
    "    best_parameters = grid_search.best_params_\n",
    "    print(f\"Best Hyperparameters: {best_parameters}\")\n",
    "\n",
    "    # Make predictions on the test set\n",
    "    best_dt_model = grid_search.best_estimator_\n",
    "    y_pred = best_dt_model.predict(x_test)\n",
    "\n",
    "    print(\"\\n--- Predictions on test set complete ---\")\n",
    "    print(\"Tuned Decision Tree Regressor result evaluation: \")\n",
    "    return evaluate_results(y_pred, y_test)\n",
    "\n",
    "\n",
    "run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7cec34c4",
   "metadata": {},
   "source": [
    "# Gradient Boost Regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d6830ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_gradient_boosting_regressor(x_train, y_train, x_test, y_test):\n",
    "    # Create the model\n",
    "    gbm = GradientBoostingRegressor(\n",
    "        n_estimators=100, learning_rate=0.1, random_state=42)\n",
    "    print(\"\\n--- Gradient Boosting Regressor (n_estimators=100, learning_rate=0.1, random_state=42) model created ---\")\n",
    "\n",
    "    # Train the Model\n",
    "    gbm.fit(x_train, y_train)\n",
    "    print(\"\\n--- Model training complete ---\")\n",
    "\n",
    "    # Make predictions on the test set\n",
    "    y_pred = gbm.predict(x_test)\n",
    "\n",
    "    print(\"\\n--- Predictions on test set complete ---\")\n",
    "\n",
    "    print(\"Gradient Boosting Regressor result evaluation: \")\n",
    "    return evaluate_results(y_pred, y_test)\n",
    "\n",
    "\n",
    "run_gradient_boosting_regressor(x_train, y_train, x_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "21327986",
   "metadata": {},
   "source": [
    "# Gradient Boost Regression with Grid Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60bd449d",
   "metadata": {},
   "outputs": [],
   "source": [
    "def run_gradient_boosting_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
    "    # Define Hyperparameters\n",
    "    param_grid_gbr = {\n",
    "        'n_estimators': [100, 300, 500],\n",
    "        'learning_rate': [0.01, 0.05, 0.1],\n",
    "        'subsample': [0.8, 1.0],\n",
    "        'max_depth': [3, 5, 7],\n",
    "        'min_samples_split': [5, 10],\n",
    "        'min_samples_leaf': [3, 5],\n",
    "    }\n",
    "\n",
    "    # Create the model\n",
    "    gbm = GradientBoostingRegressor(random_state=42)\n",
    "\n",
    "    # Set up Grid Search\n",
    "    grid_search = GridSearchCV(\n",
    "        estimator=gbm,\n",
    "        param_grid=param_grid_gbr,\n",
    "        scoring='neg_mean_squared_error',\n",
    "        cv=5,\n",
    "        verbose=1,\n",
    "        n_jobs=-1\n",
    "    )\n",
    "\n",
    "    # Train the model with Grid Search\n",
    "    print(\"Starting Grid Search...\")\n",
    "    grid_search.fit(x_train, y_train)\n",
    "    print(\"Grid Search complete.\")\n",
    "\n",
    "    best_parameters = grid_search.best_params_\n",
    "    print(f\"Best Hyperparameters: {best_parameters}\")\n",
    "\n",
    "    best_dt_model = grid_search.best_estimator_\n",
    "    y_pred = best_dt_model.predict(x_test)\n",
    "\n",
    "    print(\"\\n--- Predictions on test set complete ---\")\n",
    "    print(\"Tuned Gradient Boosting Regressor result evaluation: \")\n",
    "    return evaluate_results(y_pred, y_test)\n",
    "\n",
    "\n",
    "run_gradient_boosting_regressor_with_grid_search(\n",
    "    x_train, y_train, x_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5cff7679",
   "metadata": {},
   "source": [
    "# Compare all models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2dc752bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "linear_regession_results = run_linear_regression(\n",
    "    x_train, y_train, x_test, y_test)\n",
    "desicion_tree_results = run_decision_tree_regressor(\n",
    "    x_train, y_train, x_test, y_test)\n",
    "tuned_decision_tree_results = run_decision_tree_regressor_with_grid_search(\n",
    "    x_train, y_train, x_test, y_test)\n",
    "gradient_boosting_results = run_gradient_boosting_regressor(\n",
    "    x_train, y_train, x_test, y_test)\n",
    "tuned_gradient_boosting_results = run_gradient_boosting_regressor_with_grid_search(\n",
    "    x_train, y_train, x_test, y_test)\n",
    "\n",
    "results_data = {\n",
    "    'Linear Regression': linear_regession_results,\n",
    "    'Decision Tree': desicion_tree_results,\n",
    "    'Tuned Decision Tree': tuned_decision_tree_results,\n",
    "    'Gradient Boosting': gradient_boosting_results,\n",
    "    'Tuned Gradient Boosting': tuned_gradient_boosting_results\n",
    "}\n",
    "\n",
    "df_results = pd.DataFrame(results_data).T.sort_values(by='r2', ascending=False)\n",
    "df_results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06b46bdd",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 6))\n",
    "df_results['r2'].sort_values(ascending=False).plot(kind='bar', color='skyblue')\n",
    "plt.title('$R^2$ Score Comparison Across Models', fontsize=14)\n",
    "plt.ylabel('$R^2$ Score')\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e3ef5182",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(10, 6))\n",
    "df_results['rmse'].sort_values(ascending=True).plot(kind='bar', color='salmon')\n",
    "plt.title('RMSE Comparison Across Models', fontsize=14)\n",
    "plt.ylabel('Root Mean Squared Error (RMSE)')\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a77d08df",
   "metadata": {},
   "source": [
    "### Analysis & Interpretation:\n",
    "\n",
    "The model performed very well, explaining 92.2% of the variance in calories burned (R² = 0.92). This confirms our selected features are highly predictive.\n",
    "\n",
    "**The Mean Absolute Error (MAE)** shows that, on average, the model's prediction is off by approximately 73 calories.\n",
    "\n",
    "The most important insight comes from comparing the RMSE (123.76) to the MAE (73.45). The RMSE is significantly larger, which strongly indicates that the model has a problem with outliers. While most predictions are good (off by 73), a few predictions are very wrong, and the RMSE (which penalizes large errors) is highlighting this."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}