734 lines
21 KiB
Plaintext
734 lines
21 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "91deb647",
|
|
"metadata": {},
|
|
"source": [
|
|
"# EDA"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3d701954",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Imports\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|
"from sklearn.metrics import mean_squared_error, r2_score\n",
|
|
"from sklearn.tree import DecisionTreeRegressor\n",
|
|
"from sklearn.model_selection import GridSearchCV\n",
|
|
"from sklearn.ensemble import GradientBoostingRegressor"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "255f60b8",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Load the raw export, reading only the columns used in this notebook.\n",
  "# `cols` maps each original (German) header to the snake_case name used below.\n",
  "cols = {\n",
  "    \"Aktivitätstyp\": \"activity_type\",\n",
  "    \"Distanz\": \"distance_km\",\n",
  "    \"Anstieg gesamt\": \"elevation_meters\",\n",
  "    \"Kalorien\": \"calories_burned\",\n",
  "    \"Zeit\": \"duration_str\",\n",
  "    \"Ø Herzfrequenz\": \"heart_rate\"\n",
  "}\n",
  "\n",
  "# Read and rename in one chain so `data` is bound exactly once.\n",
  "data = (\n",
  "    pd.read_csv(\"data/raw/all_activities.csv\", usecols=list(cols))\n",
  "    .rename(columns=cols)\n",
  ")"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f37051fe",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "785d030f",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9897a6e2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Data Preprocessing\n",
|
|
"# Check for missing values\n",
|
|
"data.isnull().sum()"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "2ada8082",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Replace the duration string column with its length in seconds.\n",
  "data = (\n",
  "    data\n",
  "    .assign(duration_seconds=lambda frame: pd.to_timedelta(\n",
  "        frame[\"duration_str\"]).dt.total_seconds())\n",
  "    .drop(columns=[\"duration_str\"])\n",
  ")\n",
  "data.head()"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "d67708c7",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Remove the commas from the distance, elevation, calories and heart-rate\n",
  "# columns and convert them to numbers. Anything that still is not a number\n",
  "# afterwards becomes NaN (errors='coerce').\n",
  "columns_to_clean = ['distance_km', 'elevation_meters',\n",
  "                    'calories_burned', 'heart_rate']\n",
  "for column in columns_to_clean:\n",
  "    # astype(str) first: if read_csv already parsed a column as numeric,\n",
  "    # the .str accessor would raise AttributeError on it.\n",
  "    without_commas = data[column].astype(str).str.replace(',', '', regex=False)\n",
  "    data[column] = pd.to_numeric(without_commas, errors='coerce')\n",
  "data.head()"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c4c14efa",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# check data types\n",
|
|
"data.dtypes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ac7a0f36",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# how many none values are there now\n",
|
|
"data.isnull().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3676a6cc",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# drop rows with missing values\n",
|
|
"data = data.dropna()"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "9b4e6146",
 "metadata": {},
 "outputs": [],
 "source": [
  "# how many unusable heart rate values are left now\n",
  "# heart_rate was converted with pd.to_numeric(errors='coerce'), so a \"--\"\n",
  "# placeholder can only show up as NaN here, never as the string '--'.\n",
  "# Looking up '--' in value_counts() would therefore always report 0.\n",
  "data['heart_rate'].isna().sum()"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "d84e8f3b",
 "metadata": {},
 "outputs": [],
 "source": [
  "# drop rows without a usable heart rate\n",
  "# heart_rate is numeric at this point (pd.to_numeric with errors='coerce'),\n",
  "# so a comparison with the string '--' can never match a row. Former \"--\"\n",
  "# entries are NaN, which is what this filter removes.\n",
  "data = data[data['heart_rate'].notna()]"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "67785fce",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.shape"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "06b0bf1d",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Cast duration, calories and heart rate to integers with one astype mapping.\n",
  "data = data.astype({\n",
  "    \"duration_seconds\": int,\n",
  "    \"calories_burned\": int,\n",
  "    \"heart_rate\": int,\n",
  "})\n",
  "data.dtypes"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "414362eb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# save the cleaned data to a new csv file\n",
|
|
"data.to_csv('data/cleaned/cleaned_activities.csv', index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2474dbe2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# how many unique activity types are there\n",
|
|
"data['activity_type'].nunique()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b99e6f62",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# check for rare categories in Activity Type\n",
|
|
"print(data['activity_type'].value_counts())"
|
|
]
|
|
},
|
|
{
 "cell_type": "markdown",
 "id": "5e2ec61f",
 "metadata": {},
 "source": [
  "At this point we see that there are 14 types of activities. To simplify, we can group similar activities:\n",
  "\n",
  "- Gehen, Wandern --> Gehen\n",
  "- Indoor Cycling, Rennradfahren, Virtuelles Radfahren, Radfahren --> Radfahren\n",
  "- Laufbandtraining, Laufen --> Laufen\n",
  "\n",
  "Crosstrainer (3 rows), Skifahren (2 rows) and Stepper (1 row) are dropped, because our model cannot learn from so few examples."
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "b9e1803b",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Map each activity variant onto the broader category it belongs to.\n",
  "activity_groups = {\n",
  "    'Wandern': 'Gehen',\n",
  "    'Indoor Cycling': 'Radfahren',\n",
  "    'Rennradfahren': 'Radfahren',\n",
  "    'Virtuelles Radfahren': 'Radfahren',\n",
  "    'Laufbandtraining': 'Laufen'\n",
  "}\n",
  "data['activity_type'] = data['activity_type'].replace(activity_groups)\n",
  "\n",
  "# Remove the activity types that occur only a handful of times.\n",
  "rare_activities = ['Crosstrainer', 'Skifahren (Piste)', 'Stepper']\n",
  "is_rare = data['activity_type'].isin(rare_activities)\n",
  "data = data[~is_rare]"
 ]
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c265f4b2",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# checking the final counts of activity types\n",
|
|
"print(data['activity_type'].value_counts())"
|
|
]
|
|
},
|
|
{
 "cell_type": "markdown",
 "id": "20d39609",
 "metadata": {},
 "source": [
  "Now we have 7 activity types, with occurrences ranging from 12 to 183."
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "53d208e4",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Prepare training and test sets"
|
|
]
|
|
},
|
|
{
 "cell_type": "markdown",
 "id": "07cc0009",
 "metadata": {},
 "source": [
  "Our data is cleaned and ready. \n",
  "The first column, \"Activity Type\", holds strings, so we use one-hot encoding to include it in our regression models."
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "a165c69e",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Column roles: one categorical input, four numeric inputs, one target.\n",
  "categorical_features = ['activity_type']\n",
  "numeric_features = ['distance_km', 'elevation_meters',\n",
  "                    'duration_seconds', 'heart_rate']\n",
  "features = categorical_features + numeric_features\n",
  "target = 'calories_burned'\n",
  "\n",
  "x, y = data[features], data[target]\n",
  "\n",
  "# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.\n",
  "x_train, x_test, y_train, y_test = train_test_split(\n",
  "    x, y, test_size=0.2, random_state=42)\n",
  "print(\n",
  "    f\"Data split into {x_train.shape[0]} training rows and {x_test.shape[0]} testing rows.\")\n",
  "\n",
  "# Standardize the numeric columns and one-hot encode the activity type.\n",
  "preprocessor = ColumnTransformer(transformers=[\n",
  "    ('num', StandardScaler(), numeric_features),\n",
  "    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),\n",
  "])\n",
  "\n",
  "# Fit on the training rows only, then apply the fitted transform to the test rows.\n",
  "x_train = preprocessor.fit_transform(x_train)\n",
  "x_test = preprocessor.transform(x_test)\n",
  "\n",
  "print(\"\\n--- Preprocessing complete ---\")\n",
  "print(\"Shape of processed training data:\", x_train.shape)\n",
  "print(\"Shape of processed testing data:\", x_test.shape)"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6ec79f7b",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Correlation Matrix"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "b1bef2f0",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Pearson correlation between the numeric columns, shown as an annotated heatmap.\n",
  "correlation_matrix = data.corr(numeric_only=True, method='pearson')\n",
  "\n",
  "fig, ax = plt.subplots(figsize=(12, 10))\n",
  "sns.heatmap(correlation_matrix, annot=True, cmap=\"coolwarm\", center=0, ax=ax)\n",
  "ax.set_title(\"Korrelationsmatrix der numerischen Merkmale (Pearson)\")\n",
  "plt.show()"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "cb437880",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Evaluation Function\n",
  "def evaluate_results(y_pred, y_true):\n",
  "    \"\"\"Print and return regression metrics for a set of predictions.\n",
  "\n",
  "    Args:\n",
  "        y_pred: Predicted target values (array-like).\n",
  "        y_true: Ground-truth target values (array-like), in the same order\n",
  "            as y_pred.\n",
  "\n",
  "    Returns:\n",
  "        dict with the keys \"mse\", \"r2\", \"mae\" and \"rmse\".\n",
  "    \"\"\"\n",
  "    # Compare by position for every metric. Subtracting two pandas Series\n",
  "    # would align them on their index labels instead, so the MAE could\n",
  "    # disagree with the sklearn metrics computed on the same inputs.\n",
  "    y_true = np.asarray(y_true)\n",
  "    y_pred = np.asarray(y_pred)\n",
  "\n",
  "    mse = mean_squared_error(y_true, y_pred)\n",
  "    r2 = r2_score(y_true, y_pred)\n",
  "    mae = np.mean(np.abs(y_true - y_pred))\n",
  "    rmse = np.sqrt(mse)\n",
  "\n",
  "    print(f\"Mean Squared Error: {mse:.2f}\")\n",
  "    print(f\"R^2 Score: {r2:.2f}\")\n",
  "    print(f\"Mean Absolute Error: {mae:.2f}\")\n",
  "    print(f\"Root Mean Squared Error: {rmse:.2f}\")\n",
  "    return {\"mse\": mse, \"r2\": r2, \"mae\": mae, \"rmse\": rmse}"
 ]
},
|
|
{
 "cell_type": "markdown",
 "id": "f79e360a",
 "metadata": {},
 "source": [
  "Once a model is trained, we test how good it is.\n",
  "\n",
  "We will use ***R-Squared*** to see how well the model fits the data.\n",
  "\n",
  "We will use ***Mean Absolute Error*** to see how wrong the predictions are, on average.\n"
 ]
},
|
|
{
 "cell_type": "markdown",
 "id": "f8273d79",
 "metadata": {},
 "source": [
  "# Linear Regression"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "e0841794",
 "metadata": {},
 "outputs": [],
 "source": [
  "def run_linear_regression(x_train, y_train, x_test, y_test):\n",
  "    \"\"\"Fit a LinearRegression on the training data and score it on the test data.\n",
  "\n",
  "    Prints a progress line after each step and returns the metrics dict\n",
  "    produced by evaluate_results.\n",
  "    \"\"\"\n",
  "    regressor = LinearRegression()\n",
  "    print(\"\\n--- Linear Regression model created ---\")\n",
  "\n",
  "    regressor.fit(x_train, y_train)\n",
  "    print(\"\\n--- Model training complete ---\")\n",
  "\n",
  "    predictions = regressor.predict(x_test)\n",
  "    print(\"\\n--- Predictions on test set complete ---\")\n",
  "\n",
  "    print(\"Linear Regression result evaluation: \")\n",
  "    metrics = evaluate_results(predictions, y_test)\n",
  "    return metrics\n",
  "\n",
  "\n",
  "run_linear_regression(x_train, y_train, x_test, y_test)"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4ec3f04a",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Decision Tree Regression"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "f1c72308",
 "metadata": {},
 "outputs": [],
 "source": [
  "def run_decision_tree_regressor(x_train, y_train, x_test, y_test):\n",
  "    \"\"\"Fit an untuned DecisionTreeRegressor and score it on the test data.\n",
  "\n",
  "    Prints a progress line after each step and returns the metrics dict\n",
  "    produced by evaluate_results.\n",
  "    \"\"\"\n",
  "    # Settings are spelled out so they match the message printed below.\n",
  "    tree_settings = {\n",
  "        'random_state': 42,\n",
  "        'max_depth': None,\n",
  "        'min_samples_split': 2,\n",
  "        'min_samples_leaf': 1,\n",
  "        'criterion': 'squared_error',\n",
  "    }\n",
  "    regressor = DecisionTreeRegressor(**tree_settings)\n",
  "    print(\"\\n--- Decision Tree Regressor (max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='squared_error') model created ---\")\n",
  "\n",
  "    regressor.fit(x_train, y_train)\n",
  "    print(\"\\n--- Model training complete ---\")\n",
  "\n",
  "    predictions = regressor.predict(x_test)\n",
  "    print(\"\\n--- Predictions on test set complete ---\")\n",
  "\n",
  "    print(\"Decision Tree Regressor result evaluation: \")\n",
  "    metrics = evaluate_results(predictions, y_test)\n",
  "    return metrics\n",
  "\n",
  "\n",
  "run_decision_tree_regressor(x_train, y_train, x_test, y_test)"
 ]
},
|
|
{
 "cell_type": "markdown",
 "id": "3e161de4",
 "metadata": {},
 "source": [
  "# Decision Tree Regression with Grid Search"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "fd75750e",
 "metadata": {},
 "outputs": [],
 "source": [
  "def run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
  "    \"\"\"Tune a DecisionTreeRegressor with a 5-fold grid search and score the best tree.\n",
  "\n",
  "    The search runs on the training data and is scored with negative mean\n",
  "    squared error; the best estimator is then evaluated on the test data.\n",
  "    Returns the metrics dict produced by evaluate_results.\n",
  "    \"\"\"\n",
  "    # Candidate values tried for each hyperparameter\n",
  "    search_space = {\n",
  "        'max_depth': [3, 5, 7, 10, None],  # None means no limit\n",
  "        'min_samples_split': [2, 5, 10],\n",
  "        'min_samples_leaf': [1, 2, 4],\n",
  "        'criterion': ['squared_error', 'absolute_error'],\n",
  "    }\n",
  "\n",
  "    searcher = GridSearchCV(\n",
  "        estimator=DecisionTreeRegressor(random_state=42),\n",
  "        param_grid=search_space,\n",
  "        scoring='neg_mean_squared_error',\n",
  "        cv=5,\n",
  "        verbose=1,\n",
  "        n_jobs=-1,\n",
  "    )\n",
  "\n",
  "    print(\"Starting Grid Search...\")\n",
  "    searcher.fit(x_train, y_train)\n",
  "    print(\"Grid Search complete.\")\n",
  "\n",
  "    print(f\"Best Hyperparameters: {searcher.best_params_}\")\n",
  "\n",
  "    # Predict with the winning configuration found by the search\n",
  "    predictions = searcher.best_estimator_.predict(x_test)\n",
  "\n",
  "    print(\"\\n--- Predictions on test set complete ---\")\n",
  "    print(\"Tuned Decision Tree Regressor result evaluation: \")\n",
  "    return evaluate_results(predictions, y_test)\n",
  "\n",
  "\n",
  "run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test)"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7cec34c4",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Gradient Boost Regression"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "8d6830ce",
 "metadata": {},
 "outputs": [],
 "source": [
  "def run_gradient_boosting_regressor(x_train, y_train, x_test, y_test):\n",
  "    \"\"\"Fit a GradientBoostingRegressor with fixed settings and score it on the test data.\n",
  "\n",
  "    Prints a progress line after each step and returns the metrics dict\n",
  "    produced by evaluate_results.\n",
  "    \"\"\"\n",
  "    # Settings are spelled out so they match the message printed below.\n",
  "    booster_settings = {\n",
  "        'n_estimators': 100,\n",
  "        'learning_rate': 0.1,\n",
  "        'random_state': 42,\n",
  "    }\n",
  "    booster = GradientBoostingRegressor(**booster_settings)\n",
  "    print(\"\\n--- Gradient Boosting Regressor (n_estimators=100, learning_rate=0.1, random_state=42) model created ---\")\n",
  "\n",
  "    booster.fit(x_train, y_train)\n",
  "    print(\"\\n--- Model training complete ---\")\n",
  "\n",
  "    predictions = booster.predict(x_test)\n",
  "    print(\"\\n--- Predictions on test set complete ---\")\n",
  "\n",
  "    print(\"Gradient Boosting Regressor result evaluation: \")\n",
  "    metrics = evaluate_results(predictions, y_test)\n",
  "    return metrics\n",
  "\n",
  "\n",
  "run_gradient_boosting_regressor(x_train, y_train, x_test, y_test)"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "21327986",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Gradient Boost Regression with Grid Search"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "60bd449d",
 "metadata": {},
 "outputs": [],
 "source": [
  "def run_gradient_boosting_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
  "    \"\"\"Tune a GradientBoostingRegressor with a 5-fold grid search and score the best model.\n",
  "\n",
  "    The search runs on the training data and is scored with negative mean\n",
  "    squared error; the best estimator is then evaluated on the test data.\n",
  "    Returns the metrics dict produced by evaluate_results.\n",
  "    \"\"\"\n",
  "    # Candidate values tried for each hyperparameter\n",
  "    search_space = {\n",
  "        'n_estimators': [100, 300, 500],\n",
  "        'learning_rate': [0.01, 0.05, 0.1],\n",
  "        'subsample': [0.8, 1.0],\n",
  "        'max_depth': [3, 5, 7],\n",
  "        'min_samples_split': [5, 10],\n",
  "        'min_samples_leaf': [3, 5],\n",
  "    }\n",
  "\n",
  "    searcher = GridSearchCV(\n",
  "        estimator=GradientBoostingRegressor(random_state=42),\n",
  "        param_grid=search_space,\n",
  "        scoring='neg_mean_squared_error',\n",
  "        cv=5,\n",
  "        verbose=1,\n",
  "        n_jobs=-1,\n",
  "    )\n",
  "\n",
  "    print(\"Starting Grid Search...\")\n",
  "    searcher.fit(x_train, y_train)\n",
  "    print(\"Grid Search complete.\")\n",
  "\n",
  "    print(f\"Best Hyperparameters: {searcher.best_params_}\")\n",
  "\n",
  "    # Predict with the winning configuration found by the search\n",
  "    best_gbr_model = searcher.best_estimator_\n",
  "    predictions = best_gbr_model.predict(x_test)\n",
  "\n",
  "    print(\"\\n--- Predictions on test set complete ---\")\n",
  "    print(\"Tuned Gradient Boosting Regressor result evaluation: \")\n",
  "    return evaluate_results(predictions, y_test)\n",
  "\n",
  "\n",
  "run_gradient_boosting_regressor_with_grid_search(\n",
  "    x_train, y_train, x_test, y_test)"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5cff7679",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Compare all models"
|
|
]
|
|
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "2dc752bf",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Run every model once and collect its metrics under a readable label.\n",
  "# Dicts keep insertion order, so the models run in the order listed here.\n",
  "model_runners = {\n",
  "    'Linear Regression': run_linear_regression,\n",
  "    'Decision Tree': run_decision_tree_regressor,\n",
  "    'Tuned Decision Tree': run_decision_tree_regressor_with_grid_search,\n",
  "    'Gradient Boosting': run_gradient_boosting_regressor,\n",
  "    'Tuned Gradient Boosting': run_gradient_boosting_regressor_with_grid_search,\n",
  "}\n",
  "\n",
  "results_data = {\n",
  "    label: runner(x_train, y_train, x_test, y_test)\n",
  "    for label, runner in model_runners.items()\n",
  "}\n",
  "\n",
  "# One row per model, one column per metric, best R^2 first.\n",
  "df_results = pd.DataFrame(results_data).T.sort_values(by='r2', ascending=False)\n",
  "df_results"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "06b46bdd",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Bar chart of the R^2 score per model, highest first.\n",
  "fig, ax = plt.subplots(figsize=(10, 6))\n",
  "r2_by_model = df_results['r2'].sort_values(ascending=False)\n",
  "r2_by_model.plot(kind='bar', color='skyblue', ax=ax)\n",
  "ax.set_title('$R^2$ Score Comparison Across Models', fontsize=14)\n",
  "ax.set_ylabel('$R^2$ Score')\n",
  "plt.xticks(rotation=45, ha='right')\n",
  "ax.grid(axis='y', linestyle='--', alpha=0.7)\n",
  "fig.tight_layout()\n",
  "plt.show()"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "e3ef5182",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Bar chart of the RMSE per model, lowest (best) first.\n",
  "fig, ax = plt.subplots(figsize=(10, 6))\n",
  "rmse_by_model = df_results['rmse'].sort_values(ascending=True)\n",
  "rmse_by_model.plot(kind='bar', color='salmon', ax=ax)\n",
  "ax.set_title('RMSE Comparison Across Models', fontsize=14)\n",
  "ax.set_ylabel('Root Mean Squared Error (RMSE)')\n",
  "plt.xticks(rotation=45, ha='right')\n",
  "ax.grid(axis='y', linestyle='--', alpha=0.7)\n",
  "fig.tight_layout()\n",
  "plt.show()"
 ]
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a77d08df",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Analysis & Interpretation:\n",
|
|
"\n",
|
|
"The model performed very well, explaining 92.2% of the variance in calories burned (R² = 0.92). This confirms our selected features are highly predictive.\n",
|
|
"\n",
|
|
"**The Mean Absolute Error (MAE)** shows that, on average, the model's prediction is off by approximately 73 calories.\n",
|
|
"\n",
|
|
"The most important insight comes from comparing the RMSE (123.76) to the MAE (73.45). The RMSE is significantly larger, which strongly indicates that the model has a problem with outliers. While most predictions are good (off by 73), a few predictions are very wrong, and the RMSE (which penalizes large errors) is highlighting this."
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|