1990 lines
74 KiB
Plaintext
1990 lines
74 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "91deb647",
|
|
"metadata": {
|
|
"id": "91deb647"
|
|
},
|
|
"source": [
|
|
"# EDA"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3d701954",
|
|
"metadata": {
|
|
"id": "3d701954"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Imports\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|
"from sklearn.metrics import mean_squared_error, r2_score\n",
|
|
"from sklearn.tree import DecisionTreeRegressor\n",
|
|
"from sklearn.model_selection import GridSearchCV\n",
|
|
"from sklearn.ensemble import GradientBoostingRegressor\n",
|
|
"from xgboost import XGBRegressor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "255f60b8",
|
|
"metadata": {
|
|
"id": "255f60b8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# import the data\n",
|
|
"cols = {\n",
|
|
" \"Aktivitätstyp\": \"activity_type\",\n",
|
|
" \"Distanz\": \"distance_km\",\n",
|
|
" \"Anstieg gesamt\": \"elevation_meters\",\n",
|
|
" \"Kalorien\": \"calories_burned\",\n",
|
|
" \"Zeit\": \"duration_str\",\n",
|
|
" \"Ø Herzfrequenz\": \"heart_rate\"\n",
|
|
"}\n",
|
|
"\n",
|
|
"data = pd.read_csv(\n",
|
|
" \"data/raw/all_activities.csv\",\n",
|
|
" usecols=list(cols.keys())\n",
|
|
")\n",
|
|
"data.rename(columns=cols, inplace=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f37051fe",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 206
|
|
},
|
|
"id": "f37051fe",
|
|
"outputId": "048c5827-1775-4d14-99bd-a838bd418efa"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "785d030f",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "785d030f",
|
|
"outputId": "f80f3fd6-03aa-41da-dde0-0a5cc7014801"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9897a6e2",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 272
|
|
},
|
|
"id": "9897a6e2",
|
|
"outputId": "a840c101-7df7-4563-8d42-9ce518d64d3b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Data Preprocessing\n",
|
|
"# Check for missing values\n",
|
|
"data.isnull().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2ada8082",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 206
|
|
},
|
|
"id": "2ada8082",
|
|
"outputId": "14300a67-94c9-468b-ccb1-b80ea6ec4675"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# change duration to seconds\n",
|
|
"data[\"duration_seconds\"] = pd.to_timedelta(\n",
|
|
" data[\"duration_str\"]).dt.total_seconds()\n",
|
|
"data.drop(columns=[\"duration_str\"], inplace=True)\n",
|
|
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d67708c7",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 206
|
|
},
|
|
"id": "d67708c7",
|
|
"outputId": "643c97b4-4983-49c7-80e1-748abef59da4"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# get rid of commas in Distance and Calories Burned columns and convert to numeric\n",
|
|
"data['distance_km'] = pd.to_numeric(\n",
|
|
" data['distance_km'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['elevation_meters'] = pd.to_numeric(\n",
|
|
" data['elevation_meters'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['calories_burned'] = pd.to_numeric(\n",
|
|
" data['calories_burned'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['heart_rate'] = pd.to_numeric(\n",
|
|
" data['heart_rate'].str.replace(',', ''), errors='coerce')\n",
|
|
"data.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c4c14efa",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 272
|
|
},
|
|
"id": "c4c14efa",
|
|
"outputId": "2fb96fc5-c7d2-4272-b13c-6843296a2844"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# check data types\n",
|
|
"data.dtypes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ac7a0f36",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 272
|
|
},
|
|
"id": "ac7a0f36",
|
|
"outputId": "8c0eb6a1-1baf-4906-dce2-18c7a7cb2c74"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# how many none values are there now\n",
|
|
"data.isnull().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "IKt8zrBFq6ei",
|
|
"metadata": {
|
|
"id": "IKt8zrBFq6ei"
|
|
},
|
|
"source": [
|
|
"202 missing values in `elevation_meters` are too many to drop. Instead, we will fill them with 0."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "P8hQwDSUUAU5",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 420
|
|
},
|
|
"id": "P8hQwDSUUAU5",
|
|
"outputId": "c6447f35-fd2c-4e28-aae2-e07060e1292b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# insert 0 for the none elevation_meters\n",
|
|
"data['elevation_meters'] = data['elevation_meters'].fillna(0)  # assign back: chained inplace fillna acts on a temporary under pandas Copy-on-Write\n",
|
|
"data.isnull().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "3676a6cc",
|
|
"metadata": {
|
|
"id": "3676a6cc"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# drop rows with missing values\n",
|
|
"data = data.dropna()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9b4e6146",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "9b4e6146",
|
|
"outputId": "c9c9f5a3-d298-4f20-f3cf-4dcb9909ff7b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# how many \"--\" in heart rate now\n",
|
|
"data['heart_rate'].value_counts().get('--', 0)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "d84e8f3b",
|
|
"metadata": {
|
|
"id": "d84e8f3b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# drop rows with \"--\" in Heart Rate\n",
|
|
"data = data[data['heart_rate'] != '--']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "67785fce",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "67785fce",
|
|
"outputId": "b0703683-493d-4e0c-c559-d52fcbff17f6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"data.shape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "06b0bf1d",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 272
|
|
},
|
|
"id": "06b0bf1d",
|
|
"outputId": "ec4a98b1-8e70-4f97-80bb-0d72c767649c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# change dtype of Duration (seconds), Calories Burned and Heart Rate to integer\n",
|
|
"data[\"duration_seconds\"] = data[\"duration_seconds\"].astype(int)\n",
|
|
"data[\"calories_burned\"] = data[\"calories_burned\"].astype(int)\n",
|
|
"data[\"heart_rate\"] = data[\"heart_rate\"].astype(int)\n",
|
|
"data.dtypes"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "414362eb",
|
|
"metadata": {
|
|
"id": "414362eb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# save the cleaned data to a new csv file\n",
|
|
"data.to_csv('data/cleaned/cleaned_activities.csv', index=False)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2474dbe2",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "2474dbe2",
|
|
"outputId": "1d27257a-b133-4d13-eed4-72b5abe45b78"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# how many unique activity types are there\n",
|
|
"data['activity_type'].nunique()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b99e6f62",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "b99e6f62",
|
|
"outputId": "f9564a9d-3da3-4520-fce4-fe9eb6576620"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# check for rare categories in Activity Type\n",
|
|
"print(data['activity_type'].value_counts())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5e2ec61f",
|
|
"metadata": {
|
|
"id": "5e2ec61f"
|
|
},
|
|
"source": [
|
|
"At this point we see there are 14 types of activities. To simplify, we can group some similar activities.\n",
|
|
"\n",
|
|
"Gehen, Wandern --> Gehen\n",
|
|
"Indoor Cycling, Rennradfahren, Virtuelles Radfahren, Radfahren --> Radfahren\nLaufbandtraining, Laufen --> Laufen\n",
|
|
"\n",
|
|
"Since there are only 3 rows for Crosstrainer, 2 for Skifahren and 1 for Stepper, it is better to drop these rows because our model cannot learn from so few examples."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b9e1803b",
|
|
"metadata": {
|
|
"id": "b9e1803b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# group similar activities\n",
|
|
"data['activity_type'] = data['activity_type'].replace({\n",
|
|
" 'Wandern': 'Gehen',\n",
|
|
" 'Indoor Cycling': 'Radfahren',\n",
|
|
" 'Rennradfahren': 'Radfahren',\n",
|
|
" 'Virtuelles Radfahren': 'Radfahren',\n",
|
|
" 'Laufbandtraining': 'Laufen'\n",
|
|
"})"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "gLJC0lbOVkiS",
|
|
"metadata": {
|
|
"id": "gLJC0lbOVkiS"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# drop the rare activity types\n",
|
|
"rare_activities = ['Crosstrainer', 'Skifahren (Piste)', 'Stepper']\n",
|
|
"data = data[~data['activity_type'].isin(rare_activities)]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "c265f4b2",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "c265f4b2",
|
|
"outputId": "794da668-bc35-4103-a20b-60688ac0fe69"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# checking the final counts of activity types\n",
|
|
"print(data['activity_type'].value_counts())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "20d39609",
|
|
"metadata": {
|
|
"id": "20d39609"
|
|
},
|
|
"source": [
|
|
"Now we have 6 Activity Types with occurrences varying from 12 to 183."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "6ec79f7b",
|
|
"metadata": {
|
|
"id": "6ec79f7b"
|
|
},
|
|
"source": [
|
|
"# Correlation Matrix"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b1bef2f0",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 853
|
|
},
|
|
"id": "b1bef2f0",
|
|
"outputId": "0f053bd4-120f-4eed-96c0-5f8c37835242"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Correlation matrix\n",
|
|
"correlation_matrix = data.corr(numeric_only=True, method='pearson')\n",
|
|
"\n",
|
|
"plt.figure(figsize=(12, 10))\n",
|
|
"sns.heatmap(correlation_matrix, annot=True, cmap=\"coolwarm\", center=0)\n",
|
|
"plt.title(\"Korrelationsmatrix der numerischen Merkmale (Pearson)\")\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "53d208e4",
|
|
"metadata": {
|
|
"id": "53d208e4"
|
|
},
|
|
"source": [
|
|
"# Prepare training and test sets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "07cc0009",
|
|
"metadata": {
|
|
"id": "07cc0009"
|
|
},
|
|
"source": [
|
|
"Our data is cleaned and ready.\n",
|
|
"Our first column, `activity_type`, is a string, so we will use one-hot encoding to include it in our linear regression model."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "a165c69e",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "a165c69e",
|
|
"outputId": "3334cca8-3749-448f-f1fc-117b65f158c6"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# define features and target variable\n",
|
|
"features = ['activity_type', 'elevation_meters', 'duration_seconds', 'heart_rate'] # 'distance_km' excluded\n",
|
|
"target = 'calories_burned'\n",
|
|
"\n",
|
|
"x = data[features]\n",
|
|
"y = data[target]\n",
|
|
"\n",
|
|
"# split the data into training and testing sets\n",
|
|
"x_train, x_test, y_train, y_test = train_test_split(\n",
|
|
" x, y, test_size=0.2, random_state=42)\n",
|
|
"print(\n",
|
|
" f\"Data split into {x_train.shape[0]} training rows and {x_test.shape[0]} testing rows.\")\n",
|
|
"\n",
|
|
"# Preprocessing: Scaling numeric features and encoding categorical features\n",
|
|
"numeric_features = ['elevation_meters',\n",
|
|
" 'duration_seconds', 'heart_rate'] # 'distance_km' excluded\n",
|
|
"categorical_features = ['activity_type']\n",
|
|
"preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', StandardScaler(), numeric_features),\n",
|
|
" ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)\n",
|
|
" ]\n",
|
|
")\n",
|
|
"\n",
|
|
"x_train = preprocessor.fit_transform(x_train)\n",
|
|
"x_test = preprocessor.transform(x_test)\n",
|
|
"\n",
|
|
"print(\"\\n--- Preprocessing complete ---\")\n",
|
|
"print(\"Shape of processed training data:\", x_train.shape)\n",
|
|
"print(\"Shape of processed testing data:\", x_test.shape)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "cb437880",
|
|
"metadata": {
|
|
"id": "cb437880"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Evaluation Function\n",
|
|
"def evaluate_results(y_pred, y_true):\n",
|
|
" mse = mean_squared_error(y_true, y_pred)\n",
|
|
" r2 = r2_score(y_true, y_pred)\n",
|
|
" mae = np.mean(np.abs(y_true - y_pred))\n",
|
|
" rmse = np.sqrt(mse)\n",
|
|
" print(f\"Mean Squared Error: {mse:.2f}\")\n",
|
|
" print(f\"R^2 Score: {r2:.2f}\")\n",
|
|
" print(f\"Mean Absolute Error: {mae:.2f}\")\n",
|
|
" print(f\"Root Mean Squared Error: {rmse:.2f}\")\n",
|
|
" return {\"mse\": mse, \"r2\": r2, \"mae\": mae, \"rmse\": rmse}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f79e360a",
|
|
"metadata": {
|
|
"id": "f79e360a"
|
|
},
|
|
"source": [
|
|
"We will use **R-Squared** to see how well the model fits the data.\n",
|
|
"\n",
|
|
"We will use **Mean Absolute Error** to see how wrong the predictions are, on average.\n",
|
|
"\n",
|
|
"We will use **Mean Squared Error** to see if we have large errors.\n",
|
|
"\n",
|
|
"We will use **Root Mean Squared Error** to express the MSE in the units of the target, so that it can be compared with the MAE."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f8273d79",
|
|
"metadata": {
|
|
"id": "f8273d79"
|
|
},
|
|
"source": [
|
|
"# Linear Regression"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e0841794",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "e0841794",
|
|
"outputId": "77383cfb-8a97-4ecf-a679-396e80a84d47"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_linear_regression(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" lr_model = LinearRegression()\n",
|
|
" print(\"\\n--- Linear Regression model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" lr_model.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = lr_model.predict(x_test)\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"Linear Regression result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"run_linear_regression(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "4ec3f04a",
|
|
"metadata": {
|
|
"id": "4ec3f04a"
|
|
},
|
|
"source": [
|
|
"# Decision Tree Regression"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f1c72308",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "f1c72308",
|
|
"outputId": "3c318e90-15aa-4005-8026-47a48b4c89e2"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_decision_tree_regressor(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" tree = DecisionTreeRegressor(random_state=42, max_depth=None,\n",
|
|
" min_samples_split=2, min_samples_leaf=1, criterion='squared_error')\n",
|
|
" print(\"\\n--- Decision Tree Regressor (max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='squared_error') model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" tree.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = tree.predict(x_test)\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"Decision Tree Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"run_decision_tree_regressor(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3e161de4",
|
|
"metadata": {
|
|
"id": "3e161de4"
|
|
},
|
|
"source": [
|
|
"# Decision Tree Regression with Grid Search"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "fd75750e",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "fd75750e",
|
|
"outputId": "2e67a93c-5de9-444b-e9e2-000f3a46f13d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
|
|
" # Define Hyperparameters\n",
|
|
" param_grid = {\n",
|
|
" 'max_depth': [3, 5, 7, 10, None], # None means no limit\n",
|
|
" 'min_samples_split': [2, 5, 10],\n",
|
|
" 'min_samples_leaf': [1, 2, 4],\n",
|
|
" 'criterion': ['squared_error', 'absolute_error']\n",
|
|
" }\n",
|
|
"\n",
|
|
" # Create the model\n",
|
|
" tree = DecisionTreeRegressor(random_state=42)\n",
|
|
"\n",
|
|
" # Set up Grid Search\n",
|
|
" grid_search = GridSearchCV(\n",
|
|
" estimator=tree,\n",
|
|
" param_grid=param_grid,\n",
|
|
" scoring='neg_mean_squared_error',\n",
|
|
" cv=5,\n",
|
|
" verbose=1,\n",
|
|
" n_jobs=-1\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Train the model with Grid Search\n",
|
|
" print(\"Starting Grid Search...\")\n",
|
|
" grid_search.fit(x_train, y_train)\n",
|
|
" print(\"Grid Search complete.\")\n",
|
|
"\n",
|
|
" best_parameters = grid_search.best_params_\n",
|
|
" print(f\"Best Hyperparameters: {best_parameters}\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" best_dt_model = grid_search.best_estimator_\n",
|
|
" y_pred = best_dt_model.predict(x_test)\n",
|
|
"\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
" print(\"Tuned Decision Tree Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "7cec34c4",
|
|
"metadata": {
|
|
"id": "7cec34c4"
|
|
},
|
|
"source": [
|
|
"# Gradient Boost Regression"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8d6830ce",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "8d6830ce",
|
|
"outputId": "706cd3db-ee89-451c-aa8a-c4c51b8b47b2"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_gradient_boosting_regressor(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" gbm = GradientBoostingRegressor(\n",
|
|
" n_estimators=100, learning_rate=0.1, random_state=42)\n",
|
|
" print(\"\\n--- Gradient Boosting Regressor (n_estimators=100, learning_rate=0.1, random_state=42) model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" gbm.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = gbm.predict(x_test)\n",
|
|
"\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"Gradient Boosting Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"run_gradient_boosting_regressor(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "21327986",
|
|
"metadata": {
|
|
"id": "21327986"
|
|
},
|
|
"source": [
|
|
"# Gradient Boost Regression with Grid Search"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "60bd449d",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "60bd449d",
|
|
"outputId": "d31670c8-09d7-4cfe-862d-9183fb86618c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_gradient_boosting_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
|
|
" # Define Hyperparameters\n",
|
|
" param_grid_gbr = {\n",
|
|
" 'n_estimators': [100, 300, 500],\n",
|
|
" 'learning_rate': [0.01, 0.05, 0.1],\n",
|
|
" 'subsample': [0.8, 1.0],\n",
|
|
" 'max_depth': [3, 5, 7],\n",
|
|
" 'min_samples_split': [5, 10],\n",
|
|
" 'min_samples_leaf': [3, 5],\n",
|
|
" }\n",
|
|
"\n",
|
|
" # Create the model\n",
|
|
" gbm = GradientBoostingRegressor(random_state=42)\n",
|
|
"\n",
|
|
" # Set up Grid Search\n",
|
|
" grid_search = GridSearchCV(\n",
|
|
" estimator=gbm,\n",
|
|
" param_grid=param_grid_gbr,\n",
|
|
" scoring='neg_mean_squared_error',\n",
|
|
" cv=5,\n",
|
|
" verbose=1,\n",
|
|
" n_jobs=-1\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Train the model with Grid Search\n",
|
|
" print(\"Starting Grid Search...\")\n",
|
|
" grid_search.fit(x_train, y_train)\n",
|
|
" print(\"Grid Search complete.\")\n",
|
|
"\n",
|
|
" best_parameters = grid_search.best_params_\n",
|
|
" print(f\"Best Hyperparameters: {best_parameters}\")\n",
|
|
"\n",
|
|
" best_dt_model = grid_search.best_estimator_\n",
|
|
" y_pred = best_dt_model.predict(x_test)\n",
|
|
"\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
" print(\"Tuned Gradient Boosting Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"run_gradient_boosting_regressor_with_grid_search(\n",
|
|
" x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5cff7679",
|
|
"metadata": {
|
|
"id": "5cff7679"
|
|
},
|
|
"source": [
|
|
"# Compare all models"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2dc752bf",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 1000
|
|
},
|
|
"id": "2dc752bf",
|
|
"outputId": "89541389-7b33-47ac-9ff4-bb4337ef9953"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"linear_regession_results = run_linear_regression(\n",
|
|
" x_train, y_train, x_test, y_test)\n",
|
|
"desicion_tree_results = run_decision_tree_regressor(\n",
|
|
" x_train, y_train, x_test, y_test)\n",
|
|
"tuned_decision_tree_results = run_decision_tree_regressor_with_grid_search(\n",
|
|
" x_train, y_train, x_test, y_test)\n",
|
|
"gradient_boosting_results = run_gradient_boosting_regressor(\n",
|
|
" x_train, y_train, x_test, y_test)\n",
|
|
"tuned_gradient_boosting_results = run_gradient_boosting_regressor_with_grid_search(\n",
|
|
" x_train, y_train, x_test, y_test)\n",
|
|
"\n",
|
|
"results_data = {\n",
|
|
" 'Linear Regression': linear_regession_results,\n",
|
|
" 'Decision Tree': desicion_tree_results,\n",
|
|
" 'Tuned Decision Tree': tuned_decision_tree_results,\n",
|
|
" 'Gradient Boosting': gradient_boosting_results,\n",
|
|
" 'Tuned Gradient Boosting': tuned_gradient_boosting_results\n",
|
|
"}\n",
|
|
"\n",
|
|
"df_results = pd.DataFrame(results_data).T.sort_values(by='r2', ascending=False)\n",
|
|
"df_results"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "06b46bdd",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 557
|
|
},
|
|
"id": "06b46bdd",
|
|
"outputId": "d690517e-629b-4c0d-baba-ac1a3cc23e48"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"plt.figure(figsize=(10, 6))\n",
|
|
"df_results['r2'].sort_values(ascending=False).plot(kind='bar', color='skyblue')\n",
|
|
"plt.title('$R^2$ Score Comparison Across Models', fontsize=14)\n",
|
|
"plt.ylabel('$R^2$ Score')\n",
|
|
"plt.xticks(rotation=45, ha='right')\n",
|
|
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e3ef5182",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 558
|
|
},
|
|
"id": "e3ef5182",
|
|
"outputId": "80155472-d253-414e-9d41-c4e4ae9c9e3e"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"plt.figure(figsize=(10, 6))\n",
|
|
"df_results['rmse'].sort_values(ascending=True).plot(kind='bar', color='salmon')\n",
|
|
"plt.title('RMSE Comparison Across Models', fontsize=14)\n",
|
|
"plt.ylabel('Root Mean Squared Error (RMSE)')\n",
|
|
"plt.xticks(rotation=45, ha='right')\n",
|
|
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "yapGyrMXyh0p",
|
|
"metadata": {
|
|
"id": "yapGyrMXyh0p"
|
|
},
|
|
"source": [
|
|
"Since we have the best results with Gradient Boosting, it is worth trying XGBoost (eXtreme Gradient Boosting) to see if we can improve our scores."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8iMxMaVgloWc",
|
|
"metadata": {
|
|
"id": "8iMxMaVgloWc"
|
|
},
|
|
"source": [
|
|
"# XGBoost"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "372740c4",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "372740c4",
|
|
"outputId": "891affd7-be71-42f9-8d0d-73b1cda28d58"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_xgboost_regressor(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" xgb_model = XGBRegressor(random_state=42)\n",
|
|
" print(\"\\n--- XGBoost Regressor model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" xgb_model.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = xgb_model.predict(x_test)\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"XGBoost Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"run_xgboost_regressor(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "lXt3E4fszcND",
|
|
"metadata": {
|
|
"id": "lXt3E4fszcND"
|
|
},
|
|
"source": [
|
|
"## We see no improvement :("
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d5883f19",
|
|
"metadata": {
|
|
"id": "d5883f19"
|
|
},
|
|
"source": [
|
|
"## Execute Outlier Analysis\n",
|
|
"\n",
|
|
"Generate plots for residuals versus predicted values, the distribution of residuals, and actual versus predicted values. Additionally, identify and display the top data points with the largest absolute residuals for the best-performing Gradient Boosting Regressor model.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "KkoYtlsRnE97",
|
|
"metadata": {
|
|
"id": "KkoYtlsRnE97"
|
|
},
|
|
"source": [
|
|
"In all models we observed high Mean Squared Errors, which makes an outlier analysis necessary."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "33400276",
|
|
"metadata": {
|
|
"id": "33400276"
|
|
},
|
|
"source": [
|
|
"We will do an outlier analysis for the best-performing Gradient Boosting Regressor model. This involves generating plots for residuals versus predicted values, the distribution of residuals, and actual versus predicted values. Additionally, identify and display the top data points with the largest absolute residuals. Finally, summarize the findings from the outlier analysis, discuss potential reasons for the large residuals, and suggest next steps for handling these outliers."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1e7989b6",
|
|
"metadata": {
|
|
"id": "1e7989b6"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"First, I will re-instantiate the Gradient Boosting Regressor model with the specified parameters and train it on the `x_train` and `y_train` datasets, then make predictions on `x_test`. This is necessary to perform the outlier analysis and plotting.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "b714240a",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "b714240a",
|
|
"outputId": "99276ba1-cf38-48df-d4b0-384b6181cc1d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)\n",
|
|
"gbm.fit(x_train, y_train)\n",
|
|
"y_pred_gbm = gbm.predict(x_test)\n",
|
|
"\n",
|
|
"print(\"Gradient Boosting Regressor trained and predictions made.\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d60767d9",
|
|
"metadata": {
|
|
"id": "d60767d9"
|
|
},
|
|
"source": [
|
|
"Now that the model is trained and predictions are made, *we* will calculate the residuals, generate the requested plots for outlier analysis, and identify the top data points with the largest absolute residuals.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "46db5f55",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 502
|
|
},
|
|
"id": "46db5f55",
|
|
"outputId": "cfa20afb-80b7-4b75-d845-63442d8c4cb9"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"residuals = y_test - y_pred_gbm\n",
|
|
"\n",
|
|
"plt.figure(figsize=(18, 5))\n",
|
|
"\n",
|
|
"# Plot 1: Residuals vs Predicted Values\n",
|
|
"plt.subplot(1, 3, 1)\n",
|
|
"sns.scatterplot(x=y_pred_gbm, y=residuals)\n",
|
|
"plt.axhline(y=0, color='r', linestyle='--')\n",
|
|
"plt.xlabel('Predicted Calories Burned')\n",
|
|
"plt.ylabel('Residuals')\n",
|
|
"plt.title('Residuals vs. Predicted Values')\n",
|
|
"\n",
|
|
"# Plot 2: Distribution of Residuals\n",
|
|
"plt.subplot(1, 3, 2)\n",
|
|
"sns.histplot(residuals, kde=True)\n",
|
|
"plt.xlabel('Residuals')\n",
|
|
"plt.ylabel('Frequency')\n",
|
|
"plt.title('Distribution of Residuals')\n",
|
|
"\n",
|
|
"# Plot 3: Actual vs Predicted Values\n",
|
|
"plt.subplot(1, 3, 3)\n",
|
|
"sns.scatterplot(x=y_test, y=y_pred_gbm)\n",
|
|
"plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)\n",
|
|
"plt.xlabel('Actual Calories Burned')\n",
|
|
"plt.ylabel('Predicted Calories Burned')\n",
|
|
"plt.title('Actual vs. Predicted Values')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()\n",
|
|
"\n",
|
|
"# Combine y_test, y_pred_gbm, and residuals into a DataFrame\n",
|
|
"outlier_df = pd.DataFrame({'y_test': y_test, 'y_pred_gbm': y_pred_gbm, 'residuals': residuals})\n",
|
|
"\n",
|
|
"# Calculate absolute residuals\n",
|
|
"outlier_df['abs_residuals'] = np.abs(residuals)\n",
|
|
"\n",
|
|
"# Sort by absolute residuals in descending order and display top 10\n",
|
|
"print(\"\\nTop 10 data points with largest absolute residuals:\")\n",
|
|
"print(outlier_df.sort_values(by='abs_residuals', ascending=False).head(10))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a8b453db",
|
|
"metadata": {
|
|
"id": "a8b453db"
|
|
},
|
|
"source": [
|
|
"## Summary:\n",
|
|
"\n",
|
|
"### Data Analysis Key Findings\n",
|
|
"\n",
|
|
"* **Residuals vs. Predicted Values Plot**: The plot generally shows a scattered pattern around the zero residual line, which is a good sign. However, it also highlights specific points far from this line, indicating significant prediction errors (outliers) for those instances.\n",
|
|
"* **Distribution of Residuals Plot**: The distribution is somewhat bell-shaped, implying most errors are centered around zero. Nevertheless, the extended tails confirm the presence of numerous large positive and negative residuals, suggesting the model struggles with a subset of data points.\n",
|
|
"* **Actual vs. Predicted Values Plot**: This plot displays a strong linear relationship between actual and predicted values, closely following the ideal diagonal line. However, some points, especially at higher calorie burn values, deviate notably from this line, corresponding to the observed large residuals.\n",
|
|
"* **Top Outliers Identified**: The analysis revealed data points with substantial prediction errors. For example, the largest absolute residual was approximately 490.70, where the actual `calories_burned` was 1934, indicating a significant underprediction by the model. Other notable underpredictions include actual values of 1934 (residual of 351.34) and 2817 (residual of 343.86).\n",
|
|
"\n",
|
|
"### Insights or Next Steps\n",
|
|
"\n",
|
|
"* **Investigate Outlier Characteristics**: Analyze the raw data for the top outlier rows (e.g., indices 487, 75, 0) to determine if these are genuine activity anomalies, data entry errors, or sensor malfunctions. This deeper dive could reveal patterns or specific conditions under which the model performs poorly.\n",
|
|
"* **Enhance Feature Set**: Consider enriching the model with additional features that might explain the large residuals. This could involve creating new features like `pace` (duration/distance) or `average_speed` from existing data, or incorporating external factors such as user-specific attributes (e.g., body weight, age) or environmental conditions, if available.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "QoepqlPj1DwC",
|
|
"metadata": {
|
|
"id": "QoepqlPj1DwC"
|
|
},
|
|
"source": [
|
|
"# Summary of Outlier Analysis for Gradient Boosting Regressor"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "77ad53a8",
|
|
"metadata": {
|
|
"id": "77ad53a8"
|
|
},
|
|
"source": [
|
|
"# Taking a look at the top 10 outliers"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "_aEp_izhqV5T",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 363
|
|
},
|
|
"id": "_aEp_izhqV5T",
|
|
"outputId": "2c881b53-7139-4baf-97d4-8dc353cdc69f"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"outlier_index = [487, 75, 15, 0, 311, 361, 325, 101, 33, 140]\n",
|
|
"display(data.loc[outlier_index])\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0b7IfcWAuNCi",
|
|
"metadata": {
|
|
"id": "0b7IfcWAuNCi"
|
|
},
|
|
"source": [
|
|
"1. Observation 487 - This is a significant calorie burn for an 18 km run lasting 2.5 hours. A heart rate of 164 is quite high. This could have been a very intense run, possibly on particularly challenging terrain given the elevation. The model underpredicted this by ~490 calories.\n",
|
|
"\n",
|
|
"2. Observation 75 - Over 100 km of cycling with significant elevation and a heart rate of 143. This is a substantial ride. The calorie burn is very high, but plausible for such a long effort. The model underpredicted this by ~351 calories.\n",
|
|
"\n",
|
|
"3. Observation 15 - A 40 km cycling activity with a calorie burn of 1027. The model underpredicted this by ~192 calories."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "9de0df98",
|
|
"metadata": {
|
|
"id": "9de0df98"
|
|
},
|
|
"source": [
|
|
"## Dropping Outliers\n",
|
|
"\n",
|
|
"Rows 487 and 75 are dropped because they are the largest underpredictions by the Gradient Boosting model. These rows correspond to extreme and unusual activities that the model struggled to capture, and their removal is intended to improve the model's overall generalization and reduce prediction errors on more typical data points. Note that both rows were identified from test-set residuals, so metrics computed after their removal are not directly comparable to the earlier ones.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "39c15b5f",
|
|
"metadata": {
|
|
"id": "39c15b5f"
|
|
},
|
|
"source": [
|
|
"# Task\n",
|
|
"Drop the rows with indices 487 and 75 from the `data` DataFrame. Then, re-run all subsequent data preprocessing steps, including `duration_str` conversion, converting other data types, grouping similar 'activity_type' categories, and applying feature scaling and one-hot encoding. Next, re-run all the model training and evaluation cells for Linear Regression, Decision Tree (tuned and untuned), Gradient Boosting (tuned and untuned), and XGBoost (tuned and untuned) using the updated preprocessed data. Finally, generate comparison plots and analyze the performance metrics (R-squared, RMSE, MAE) of all models to evaluate the impact of dropping the identified outliers, and summarize the impact of this outlier treatment on model performance, identifying the best-performing model and discussing improvements in prediction accuracy and generalizability."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b94757d9",
|
|
"metadata": {
|
|
"id": "b94757d9"
|
|
},
|
|
"source": [
|
|
"## Re-execute Data Preprocessing\n",
|
|
"\n",
|
|
"### Subtask:\n",
|
|
"Drop specified outliers and re-run all data preprocessing steps.\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "b408548b",
|
|
"metadata": {
|
|
"id": "b408548b"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"I will re-load the original dataset, drop the specified outlier rows, and then apply all the data preprocessing steps as outlined in the instructions to prepare the data for re-training the models. This includes converting data types, handling missing values, grouping activity types, dropping rare activity types, defining features and target, and finally performing train-test split and preprocessing using StandardScaler and OneHotEncoder.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "e3e40db1",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "e3e40db1",
|
|
"outputId": "de1486b5-c6fd-4973-fff5-be50347f566c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|
"\n",
|
|
"# 1. Reload the initial dataset\n",
|
|
"cols = {\n",
|
|
" \"Aktivitätstyp\": \"activity_type\",\n",
|
|
" \"Distanz\": \"distance_km\",\n",
|
|
" \"Anstieg gesamt\": \"elevation_meters\",\n",
|
|
" \"Kalorien\": \"calories_burned\",\n",
|
|
" \"Zeit\": \"duration_str\",\n",
|
|
" \"Ø Herzfrequenz\": \"heart_rate\"\n",
|
|
"}\n",
|
|
"\n",
|
|
"data = pd.read_csv(\n",
|
|
" \"data/raw/all_activities.csv\",\n",
|
|
" usecols=list(cols.keys())\n",
|
|
")\n",
|
|
"data.rename(columns=cols, inplace=True)\n",
|
|
"\n",
|
|
"# 2. Drop the rows with original indices 487 and 75\n",
|
|
"data.drop(index=[487, 75], inplace=True)\n",
|
|
"\n",
|
|
"# 3. Convert duration to seconds and drop original string column\n",
|
|
"data[\"duration_seconds\"] = pd.to_timedelta(\n",
|
|
" data[\"duration_str\"]).dt.total_seconds()\n",
|
|
"data.drop(columns=[\"duration_str\"], inplace=True)\n",
|
|
"\n",
|
|
"# 4. Clean and convert to numeric types by removing commas\n",
|
|
"data['distance_km'] = pd.to_numeric(\n",
|
|
" data['distance_km'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['elevation_meters'] = pd.to_numeric(\n",
|
|
" data['elevation_meters'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['calories_burned'] = pd.to_numeric(\n",
|
|
" data['calories_burned'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['heart_rate'] = pd.to_numeric(\n",
|
|
" data['heart_rate'].str.replace(',', ''), errors='coerce')\n",
|
|
"\n",
|
|
"# 5. Fill missing 'elevation_meters' with 0\n",
|
|
"data['elevation_meters'].fillna(0, inplace=True)\n",
|
|
"\n",
|
|
"# 6. Drop any other rows with missing values\n",
|
|
"data = data.dropna()\n",
|
|
"\n",
|
|
"# 7. Convert data types to integers\n",
|
|
"data[\"duration_seconds\"] = data[\"duration_seconds\"].astype(int)\n",
|
|
"data[\"calories_burned\"] = data[\"calories_burned\"].astype(int)\n",
|
|
"data[\"heart_rate\"] = data[\"heart_rate\"].astype(int)\n",
|
|
"\n",
|
|
"# 8. Group similar activity types\n",
|
|
"data['activity_type'] = data['activity_type'].replace({\n",
|
|
" 'Wandern': 'Gehen',\n",
|
|
" 'Indoor Cycling': 'Radfahren',\n",
|
|
" 'Rennradfahren': 'Radfahren',\n",
|
|
" 'Virtuelles Radfahren': 'Radfahren',\n",
|
|
" 'Laufbandtraining': 'Laufen'\n",
|
|
"})\n",
|
|
"\n",
|
|
"# 9. Drop rare activity types\n",
|
|
"rare_activities = ['Crosstrainer', 'Skifahren (Piste)', 'Stepper']\n",
|
|
"data = data[~data['activity_type'].isin(rare_activities)]\n",
|
|
"\n",
|
|
"# 10. Define features and target variable\n",
|
|
"features = ['activity_type', 'elevation_meters', 'duration_seconds', 'heart_rate']\n",
|
|
"target = 'calories_burned'\n",
|
|
"\n",
|
|
"x = data[features]\n",
|
|
"y = data[target]\n",
|
|
"\n",
|
|
"# 11. Split the data into training and testing sets\n",
|
|
"x_train, x_test, y_train, y_test = train_test_split(\n",
|
|
" x, y, test_size=0.2, random_state=42)\n",
|
|
"print(f\"Data split into {x_train.shape[0]} training rows and {x_test.shape[0]} testing rows.\")\n",
|
|
"\n",
|
|
"# 12 & 13. Apply preprocessing\n",
|
|
"numeric_features = ['elevation_meters',\n",
|
|
" 'duration_seconds', 'heart_rate']\n",
|
|
"categorical_features = ['activity_type']\n",
|
|
"preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', StandardScaler(), numeric_features),\n",
|
|
" ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)\n",
|
|
" ]\n",
|
|
")\n",
|
|
"\n",
|
|
"x_train = preprocessor.fit_transform(x_train)\n",
|
|
"x_test = preprocessor.transform(x_test)\n",
|
|
"\n",
|
|
"print(\"\\n--- Preprocessing complete ---\")\n",
|
|
"print(\"Shape of processed training data:\", x_train.shape)\n",
|
|
"print(\"Shape of processed testing data:\", x_test.shape)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "605d91fc",
|
|
"metadata": {
|
|
"id": "605d91fc"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"The previous execution produced a `FutureWarning` because `fillna(0, inplace=True)` was called on a column selected from the DataFrame (`data['elevation_meters']`), i.e. an in-place operation through chained assignment, which may act on a temporary copy rather than on `data`. I will update the line `data['elevation_meters'].fillna(0, inplace=True)` to `data['elevation_meters'] = data['elevation_meters'].fillna(0)` to explicitly reassign the result, thus avoiding the warning and ensuring the operation is applied to the original DataFrame.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "671dd55c",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "671dd55c",
|
|
"outputId": "eed72d43-7850-40f5-8e4d-bada2c0dea77"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
|
|
"\n",
|
|
"# 1. Reload the initial dataset\n",
|
|
"cols = {\n",
|
|
" \"Aktivitätstyp\": \"activity_type\",\n",
|
|
" \"Distanz\": \"distance_km\",\n",
|
|
" \"Anstieg gesamt\": \"elevation_meters\",\n",
|
|
" \"Kalorien\": \"calories_burned\",\n",
|
|
" \"Zeit\": \"duration_str\",\n",
|
|
" \"Ø Herzfrequenz\": \"heart_rate\"\n",
|
|
"}\n",
|
|
"\n",
|
|
"data = pd.read_csv(\n",
|
|
" \"data/raw/all_activities.csv\",\n",
|
|
" usecols=list(cols.keys())\n",
|
|
")\n",
|
|
"data.rename(columns=cols, inplace=True)\n",
|
|
"\n",
|
|
"# 2. Drop the rows with original indices 487 and 75\n",
|
|
"data.drop(index=[487, 75], inplace=True)\n",
|
|
"\n",
|
|
"# 3. Convert duration to seconds and drop original string column\n",
|
|
"data[\"duration_seconds\"] = pd.to_timedelta(\n",
|
|
" data[\"duration_str\"]).dt.total_seconds()\n",
|
|
"data.drop(columns=[\"duration_str\"], inplace=True)\n",
|
|
"\n",
|
|
"# 4. Clean and convert to numeric types by removing commas\n",
|
|
"data['distance_km'] = pd.to_numeric(\n",
|
|
" data['distance_km'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['elevation_meters'] = pd.to_numeric(\n",
|
|
" data['elevation_meters'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['calories_burned'] = pd.to_numeric(\n",
|
|
" data['calories_burned'].str.replace(',', ''), errors='coerce')\n",
|
|
"data['heart_rate'] = pd.to_numeric(\n",
|
|
" data['heart_rate'].str.replace(',', ''), errors='coerce')\n",
|
|
"\n",
|
|
"# 5. Fill missing 'elevation_meters' with 0\n",
|
|
"data['elevation_meters'] = data['elevation_meters'].fillna(0)\n",
|
|
"\n",
|
|
"# 6. Drop any other rows with missing values\n",
|
|
"data = data.dropna()\n",
|
|
"\n",
|
|
"# 7. Convert data types to integers\n",
|
|
"data[\"duration_seconds\"] = data[\"duration_seconds\"].astype(int)\n",
|
|
"data[\"calories_burned\"] = data[\"calories_burned\"].astype(int)\n",
|
|
"data[\"heart_rate\"] = data[\"heart_rate\"].astype(int)\n",
|
|
"\n",
|
|
"# 8. Group similar activity types\n",
|
|
"data['activity_type'] = data['activity_type'].replace({\n",
|
|
" 'Wandern': 'Gehen',\n",
|
|
" 'Indoor Cycling': 'Radfahren',\n",
|
|
" 'Rennradfahren': 'Radfahren',\n",
|
|
" 'Virtuelles Radfahren': 'Radfahren',\n",
|
|
" 'Laufbandtraining': 'Laufen'\n",
|
|
"})\n",
|
|
"\n",
|
|
"# 9. Drop rare activity types\n",
|
|
"rare_activities = ['Crosstrainer', 'Skifahren (Piste)', 'Stepper']\n",
|
|
"data = data[~data['activity_type'].isin(rare_activities)]\n",
|
|
"\n",
|
|
"# 10. Define features and target variable\n",
|
|
"features = ['activity_type', 'elevation_meters', 'duration_seconds', 'heart_rate']\n",
|
|
"target = 'calories_burned'\n",
|
|
"\n",
|
|
"x = data[features]\n",
|
|
"y = data[target]\n",
|
|
"\n",
|
|
"# 11. Split the data into training and testing sets\n",
|
|
"x_train, x_test, y_train, y_test = train_test_split(\n",
|
|
" x, y, test_size=0.2, random_state=42)\n",
|
|
"print(f\"Data split into {x_train.shape[0]} training rows and {x_test.shape[0]} testing rows.\")\n",
|
|
"\n",
|
|
"# 12 & 13. Apply preprocessing\n",
|
|
"numeric_features = ['elevation_meters',\n",
|
|
" 'duration_seconds', 'heart_rate']\n",
|
|
"categorical_features = ['activity_type']\n",
|
|
"preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', StandardScaler(), numeric_features),\n",
|
|
" ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)\n",
|
|
" ]\n",
|
|
")\n",
|
|
"\n",
|
|
"x_train = preprocessor.fit_transform(x_train)\n",
|
|
"x_test = preprocessor.transform(x_test)\n",
|
|
"\n",
|
|
"print(\"\\n--- Preprocessing complete ---\")\n",
|
|
"print(\"Shape of processed training data:\", x_train.shape)\n",
|
|
"print(\"Shape of processed testing data:\", x_test.shape)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "39c1f456",
|
|
"metadata": {
|
|
"id": "39c1f456"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"Now that the data has been re-preprocessed after dropping outliers, I will re-run the Linear Regression model on the updated `x_train`, `y_train`, `x_test`, and `y_test` datasets to evaluate its performance with the refined data.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "05d6fbc6",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "05d6fbc6",
|
|
"outputId": "3ffe5843-3e4f-45ec-ceb8-1d94ec85b1f7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_linear_regression(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" lr_model = LinearRegression()\n",
|
|
" print(\"\\n--- Linear Regression model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" lr_model.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = lr_model.predict(x_test)\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"Linear Regression result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"linear_regession_results_cleaned = run_linear_regression(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "857ce5d4",
|
|
"metadata": {
|
|
"id": "857ce5d4"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"Now that the data has been re-preprocessed after dropping outliers, I will re-run the untuned Decision Tree Regressor model on the updated `x_train`, `y_train`, `x_test`, and `y_test` datasets to evaluate its performance with the refined data.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "766587eb",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "766587eb",
|
|
"outputId": "80db77e5-60aa-4f76-b55b-4bd9ac54503d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_decision_tree_regressor(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" tree = DecisionTreeRegressor(random_state=42, max_depth=None,\n",
|
|
" min_samples_split=2, min_samples_leaf=1, criterion='squared_error')\n",
|
|
" print(\"\\n--- Decision Tree Regressor (max_depth=None, min_samples_split=2, min_samples_leaf=1, criterion='squared_error') model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" tree.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = tree.predict(x_test)\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"Decision Tree Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"desicion_tree_results_cleaned = run_decision_tree_regressor(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8fafbaa1",
|
|
"metadata": {
|
|
"id": "8fafbaa1"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"Now that the data has been re-preprocessed after dropping outliers, I will re-run the Tuned Decision Tree Regressor model on the updated `x_train`, `y_train`, `x_test`, and `y_test` datasets to evaluate its performance with the refined data.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "9219db5f",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "9219db5f",
|
|
"outputId": "49e7e1b3-9626-405c-e856-28c8e8ec7f8c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
|
|
" # Define Hyperparameters\n",
|
|
" param_grid = {\n",
|
|
" 'max_depth': [3, 5, 7, 10, None], # None means no limit\n",
|
|
" 'min_samples_split': [2, 5, 10],\n",
|
|
" 'min_samples_leaf': [1, 2, 4],\n",
|
|
" 'criterion': ['squared_error', 'absolute_error']\n",
|
|
" }\n",
|
|
"\n",
|
|
" # Create the model\n",
|
|
" tree = DecisionTreeRegressor(random_state=42)\n",
|
|
"\n",
|
|
" # Set up Grid Search\n",
|
|
" grid_search = GridSearchCV(\n",
|
|
" estimator=tree,\n",
|
|
" param_grid=param_grid,\n",
|
|
" scoring='neg_mean_squared_error',\n",
|
|
" cv=5,\n",
|
|
" verbose=1,\n",
|
|
" n_jobs=-1\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Train the model with Grid Search\n",
|
|
" print(\"Starting Grid Search...\")\n",
|
|
" grid_search.fit(x_train, y_train)\n",
|
|
" print(\"Grid Search complete.\")\n",
|
|
"\n",
|
|
" best_parameters = grid_search.best_params_\n",
|
|
" print(f\"Best Hyperparameters: {best_parameters}\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" best_dt_model = grid_search.best_estimator_\n",
|
|
" y_pred = best_dt_model.predict(x_test)\n",
|
|
"\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
" print(\"Tuned Decision Tree Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"tuned_decision_tree_results_cleaned = run_decision_tree_regressor_with_grid_search(x_train, y_train, x_test, y_test)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f3262fc7",
|
|
"metadata": {
|
|
"id": "f3262fc7"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"Now that the data has been re-preprocessed after dropping outliers, I will re-run the untuned Gradient Boosting Regressor model on the updated `x_train`, `y_train`, `x_test`, and `y_test` datasets to evaluate its performance with the refined data.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "1cc43328",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "1cc43328",
|
|
"outputId": "967cd748-e115-4e5f-8fa9-60303d16faf8"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_gradient_boosting_regressor(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" gbm = GradientBoostingRegressor(\n",
|
|
" n_estimators=100, learning_rate=0.1, random_state=42)\n",
|
|
" print(\"\\n--- Gradient Boosting Regressor (n_estimators=100, learning_rate=0.1, random_state=42) model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" gbm.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = gbm.predict(x_test)\n",
|
|
"\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"Gradient Boosting Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"gradient_boosting_results_cleaned = run_gradient_boosting_regressor(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1b694672",
|
|
"metadata": {
|
|
"id": "1b694672"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"Now that the data has been re-preprocessed after dropping outliers, I will re-run the Tuned Gradient Boosting Regressor model on the updated `x_train`, `y_train`, `x_test`, and `y_test` datasets to evaluate its performance with the refined data.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "4b8fb71d",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "4b8fb71d",
|
|
"outputId": "34431583-faa2-4b50-a814-3b1724e3fe24"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def run_gradient_boosting_regressor_with_grid_search(x_train, y_train, x_test, y_test):\n",
|
|
" # Define Hyperparameters\n",
|
|
" param_grid_gbr = {\n",
|
|
" 'n_estimators': [100, 300, 500],\n",
|
|
" 'learning_rate': [0.01, 0.05, 0.1],\n",
|
|
" 'subsample': [0.8, 1.0],\n",
|
|
" 'max_depth': [3, 5, 7],\n",
|
|
" 'min_samples_split': [5, 10],\n",
|
|
" 'min_samples_leaf': [3, 5],\n",
|
|
" }\n",
|
|
"\n",
|
|
" # Create the model\n",
|
|
" gbm = GradientBoostingRegressor(random_state=42)\n",
|
|
"\n",
|
|
" # Set up Grid Search\n",
|
|
" grid_search = GridSearchCV(\n",
|
|
" estimator=gbm,\n",
|
|
" param_grid=param_grid_gbr,\n",
|
|
" scoring='neg_mean_squared_error',\n",
|
|
" cv=5,\n",
|
|
" verbose=1,\n",
|
|
" n_jobs=-1\n",
|
|
" )\n",
|
|
"\n",
|
|
" # Train the model with Grid Search\n",
|
|
" print(\"Starting Grid Search...\")\n",
|
|
" grid_search.fit(x_train, y_train)\n",
|
|
" print(\"Grid Search complete.\")\n",
|
|
"\n",
|
|
" best_parameters = grid_search.best_params_\n",
|
|
" print(f\"Best Hyperparameters: {best_parameters}\")\n",
|
|
"\n",
|
|
" best_dt_model = grid_search.best_estimator_\n",
|
|
" y_pred = best_dt_model.predict(x_test)\n",
|
|
"\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
" print(\"Tuned Gradient Boosting Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"tuned_gradient_boosting_results_cleaned = run_gradient_boosting_regressor_with_grid_search(\n",
|
|
" x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "072cc903",
|
|
"metadata": {
|
|
"id": "072cc903"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"Now that the data has been re-preprocessed after dropping outliers, I will re-run the XGBoost Regressor model on the updated `x_train`, `y_train`, `x_test`, and `y_test` datasets to evaluate its performance with the refined data.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "f3c1e5c6",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "f3c1e5c6",
|
|
"outputId": "df029bdd-6a92-4f99-c575-5c90f63cb855"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from xgboost import XGBRegressor\n",
|
|
"\n",
|
|
"def run_xgboost_regressor(x_train, y_train, x_test, y_test):\n",
|
|
" # Create the model\n",
|
|
" xgb_model = XGBRegressor(random_state=42)\n",
|
|
" print(\"\\n--- XGBoost Regressor model created ---\")\n",
|
|
"\n",
|
|
" # Train the Model\n",
|
|
" xgb_model.fit(x_train, y_train)\n",
|
|
" print(\"\\n--- Model training complete ---\")\n",
|
|
"\n",
|
|
" # Make predictions on the test set\n",
|
|
" y_pred = xgb_model.predict(x_test)\n",
|
|
" print(\"\\n--- Predictions on test set complete ---\")\n",
|
|
"\n",
|
|
" print(\"XGBoost Regressor result evaluation: \")\n",
|
|
" return evaluate_results(y_pred, y_test)\n",
|
|
"\n",
|
|
"\n",
|
|
"xgboost_results_cleaned = run_xgboost_regressor(x_train, y_train, x_test, y_test)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1d8b15eb",
|
|
"metadata": {
|
|
"id": "1d8b15eb"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"Now that all models have been re-trained and evaluated on the cleaned data, I will combine the performance metrics of all models (before and after outlier removal) into a single DataFrame to facilitate comparison and identify the best-performing model.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "523502d0",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 425
|
|
},
|
|
"id": "523502d0",
|
|
"outputId": "08884784-1ebb-42d3-9ef0-e9a3c356babc"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"results_data_cleaned = {\n",
|
|
" 'Linear Regression (Cleaned)': linear_regession_results_cleaned,\n",
|
|
" 'Decision Tree (Cleaned)': desicion_tree_results_cleaned,\n",
|
|
" 'Tuned Decision Tree (Cleaned)': tuned_decision_tree_results_cleaned,\n",
|
|
" 'Gradient Boosting (Cleaned)': gradient_boosting_results_cleaned,\n",
|
|
" 'Tuned Gradient Boosting (Cleaned)': tuned_gradient_boosting_results_cleaned,\n",
|
|
" 'XGBoost (Cleaned)': xgboost_results_cleaned\n",
|
|
"}\n",
|
|
"\n",
|
|
"# Combine with original results\n",
|
|
"combined_results_data = {**results_data, **results_data_cleaned}\n",
|
|
"\n",
|
|
"df_results_cleaned = pd.DataFrame(combined_results_data).T.sort_values(by='r2', ascending=False)\n",
|
|
"df_results_cleaned"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "c946c125",
|
|
"metadata": {
|
|
"id": "c946c125"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"I will generate a bar plot to compare the R-squared scores of all models (before and after outlier removal), similar to the previous R-squared comparison plot, to visually assess the impact of dropping outliers.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "296d4deb",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 543
|
|
},
|
|
"id": "296d4deb",
|
|
"outputId": "0e6def42-8483-4e40-debd-d92673958fe9"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"plt.figure(figsize=(12, 7))\n",
|
|
"df_results_cleaned['r2'].sort_values(ascending=False).plot(kind='bar', color='skyblue')\n",
|
|
"plt.title('$R^2$ Score Comparison Across Models (with and without Outliers)', fontsize=14)\n",
|
|
"plt.ylabel('$R^2$ Score')\n",
|
|
"plt.xticks(rotation=45, ha='right')\n",
|
|
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d8271388",
|
|
"metadata": {
|
|
"id": "d8271388"
|
|
},
|
|
"source": [
|
|
"**Reasoning**:\n",
|
|
"I will generate a bar plot to compare the RMSE scores of all models (before and after outlier removal), similar to the previous R-squared comparison plot, to visually assess the impact of dropping outliers.\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "5bf99015",
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/",
|
|
"height": 545
|
|
},
|
|
"id": "5bf99015",
|
|
"outputId": "f87ca59f-aa0d-4540-d8a9-ac6ae4f9b81c"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"plt.figure(figsize=(12, 7))\n",
|
|
"df_results_cleaned['rmse'].sort_values(ascending=True).plot(kind='bar', color='salmon')\n",
|
|
"plt.title('RMSE Comparison Across Models (with and without Outliers)', fontsize=14)\n",
|
|
"plt.ylabel('Root Mean Squared Error (RMSE)')\n",
|
|
"plt.xticks(rotation=45, ha='right')\n",
|
|
"plt.grid(axis='y', linestyle='--', alpha=0.7)\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5e2a2267",
|
|
"metadata": {
|
|
"id": "5e2a2267"
|
|
},
|
|
"source": [
|
|
"## Summary of Outlier Treatment Impact on Model Performance\n",
|
|
"\n",
|
|
"After re-running all models with the cleaned data (dropping the two identified outliers), we observe the following:\n",
|
|
"\n",
|
|
"### Impact on Model Performance\n",
|
|
"\n",
|
|
"- **Gradient Boosting Regressor (Untuned)**:\n",
|
|
" - **Before Outlier Removal**: R-squared: 0.9567, RMSE: 92.00, MAE: 43.90\n",
|
|
" - **After Outlier Removal**: R-squared: 0.9553, RMSE: 92.50, MAE: 47.98\n",
|
|
" - **Impact**: While the R-squared remained very high, there was a slight decrease in R-squared and a marginal increase in RMSE and MAE. This suggests that the model was already robust, and removing these specific outliers, though large in magnitude, did not significantly improve its overall performance on the test set. It might have adjusted its predictions slightly for other data points.\n",
|
|
"\n",
|
|
"- **XGBoost Regressor (Untuned)**:\n",
|
|
" - **Before Outlier Removal (Original)**: R-squared: 0.8502, RMSE: 171.10, MAE: 56.83\n",
|
|
" - **After Outlier Removal**: R-squared: 0.9555, RMSE: 92.33, MAE: 46.74\n",
|
|
"  - **Impact**: XGBoost showed a remarkable improvement after outlier removal. Its R-squared increased significantly from ~0.85 to ~0.96, its RMSE nearly halved, and its MAE decreased more modestly. This indicates that the presence of those extreme outliers heavily skewed its learning, and their removal allowed the model to learn the underlying patterns much more effectively, leading to significantly better predictions.\n",
|
|
"\n",
|
|
"- **Linear Regression**:\n",
|
|
" - **Before Outlier Removal**: R-squared: 0.9236, RMSE: 122.16, MAE: 75.12\n",
|
|
" - **After Outlier Removal**: R-squared: 0.9244, RMSE: 120.33, MAE: 74.76\n",
|
|
" - **Impact**: Linear Regression showed a minor improvement in all metrics (slight increase in R-squared, slight decrease in RMSE and MAE). This is expected as linear models are generally more sensitive to outliers, and their removal can lead to a cleaner fit.\n",
|
|
"\n",
|
|
"- **Decision Tree Regressor (Untuned)**:\n",
|
|
" - **Before Outlier Removal**: R-squared: -0.7529, RMSE: 585.28, MAE: 128.33\n",
|
|
" - **After Outlier Removal**: R-squared: 0.8237, RMSE: 183.78, MAE: 84.32\n",
|
|
" - **Impact**: The untuned Decision Tree saw a massive improvement, changing from a very poor negative R-squared score to a positive and reasonable one. Both RMSE and MAE decreased substantially. This highlights that extreme outliers caused significant overfitting or poor generalization for the unconstrained Decision Tree.\n",
|
|
"\n",
|
|
"- **Tuned Decision Tree Regressor**:\n",
|
|
" - **Before Outlier Removal**: R-squared: 0.8598, RMSE: 165.49, MAE: 69.92\n",
|
|
" - **After Outlier Removal**: R-squared: 0.9177, RMSE: 125.56, MAE: 61.22\n",
|
|
" - **Impact**: The tuned Decision Tree also improved significantly, with R-squared increasing and RMSE/MAE decreasing. Hyperparameter tuning likely made it more robust to the initial outliers than the untuned version, but their removal still yielded better performance.\n",
|
|
"\n",
|
|
"- **Tuned Gradient Boosting Regressor**:\n",
|
|
" - **Before Outlier Removal**: R-squared: 0.9541, RMSE: 94.68, MAE: 44.81\n",
|
|
" - **After Outlier Removal**: R-squared: 0.9206, RMSE: 123.29, MAE: 54.67\n",
|
|
" - **Impact**: Interestingly, the tuned Gradient Boosting Regressor saw a noticeable decrease in performance (lower R-squared, higher RMSE/MAE) after outlier removal, a larger drop than the marginal one seen for its untuned counterpart. This could be due to the hyperparameters having been optimized on the *original* dataset (including the outliers), so they may not be optimal for the slightly altered distribution of the cleaned dataset. Re-tuning this model on the cleaned data would likely improve its scores.\n",
|
|
"\n",
|
|
"### Improvements in Prediction Accuracy and Generalizability\n",
|
|
"\n",
|
|
"- **Improved Accuracy**: The most significant improvement in prediction accuracy was seen in XGBoost and the untuned Decision Tree, where removing the outliers allowed these models to find more stable and accurate relationships within the data, leading to much lower errors.\n",
|
|
"- **Enhanced Generalizability**: By removing data points that were potentially anomalies or extreme cases, the models, especially XGBoost and Decision Tree, can now generalize better to 'typical' new data. The previous high errors on these outliers likely led to models that were either overly complex (Decision Tree) or struggling to find a consistent mapping (XGBoost).\n",
|
|
"\n",
|
|
"In conclusion, outlier removal proved highly beneficial for several models, particularly for XGBoost and the untuned Decision Tree, significantly boosting their R-squared scores and reducing error metrics. This confirms the importance of identifying and handling influential outliers in the preprocessing phase for improving model performance and robustness.\n",
|
|
"\n",
|
|
"### Best Performing Model\n",
|
|
"\n",
|
|
"Based on the R-squared and RMSE metrics on the cleaned data, the **XGBoost Regressor (Untuned)** and **Gradient Boosting Regressor (Untuned)** models are now the top performers, both achieving an R-squared of approximately 0.955 and RMSE around 92.\n",
|
|
"\n",
|
|
"Specifically:\n",
|
|
"\n",
|
|
"- **Gradient Boosting (Original)**: R^2: 0.9567, RMSE: 92.00 (This is the best overall performance, even compared to cleaned data models)\n",
|
|
"- **XGBoost (Cleaned)**: R^2: 0.9555, RMSE: 92.33\n",
|
|
"- **Gradient Boosting (Cleaned)**: R^2: 0.9553, RMSE: 92.50\n",
|
|
"\n",
|
|
"It appears that the original Gradient Boosting model was already highly effective, and while XGBoost improved dramatically with outlier removal, it didn't surpass the peak performance of the original Gradient Boosting. However, the cleaned XGBoost model is now very competitive."
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.13.5"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|