{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c2188cd7",
   "metadata": {},
   "source": [
    "# Read data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b2b0060",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# Threshold separating training that ends shortly before bedtime from earlier training.\n",
    "BEDTIME_WINDOW = pd.Timedelta(hours=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52f55dde",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map the raw CSV column headers to the snake_case names used in the rest of the notebook.\n",
    "activity_cols = {\n",
    "    \"Datum\": \"date\",\n",
    "    \"Ø Pace\": \"activity_pace_average\",\n",
    "    \"Kalorien\": \"activity_calories\",\n",
    "    \"Zeit\": \"activity_duration\",\n",
    "    \"Ø Herzfrequenz\": \"activity_heart_rate_average\",\n",
    "    \"Ø Atemfrequenz\": \"activity_breathing_rate_average\",\n",
    "}\n",
    "df_activities = pd.read_csv(\n",
    "    \"data/raw/all_activities.csv\",\n",
    "    usecols=list(activity_cols),\n",
    ").rename(columns=activity_cols)\n",
    "\n",
    "sleep_cols = {\n",
    "    \"Sleep Score 7 Tage\": \"date\",\n",
    "    \"Score\": \"sleep_score\",\n",
    "    \"Dauer\": \"sleep_duration\",\n",
    "    \"Schlafenszeit\": \"sleep_bedtime\",\n",
    "    \"Ruheherzfrequenz\": \"sleep_resting_heart_rate\",\n",
    "    \"HFV-Status\": \"sleep_hrv_status\",\n",
    "    \"Atmung\": \"sleep_breathing_rate\",\n",
    "    \"Schlafbedürfnis\": \"sleep_duration_needed\",\n",
    "}\n",
    "df_sleep = pd.read_csv(\n",
    "    \"data/raw/sleep.csv\",\n",
    "    usecols=list(sleep_cols),\n",
    ").rename(columns=sleep_cols)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "91f04fe7",
   "metadata": {},
   "source": [
    "# Clean data/time features"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "97252627",
   "metadata": {},
   "source": [
    "## Activities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "826e5af0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def pace_to_seconds(pace_str):\n",
    "    \"\"\"Convert a pace value to seconds.\n",
    "\n",
    "    Strings of the form \"MM:SS\" or \"HH:MM:SS\" are converted field by field;\n",
    "    any other input is handed to ``float``. Values that cannot be parsed\n",
    "    yield 0.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        if isinstance(pace_str, str) and \":\" in pace_str:\n",
    "            fields = [int(field) for field in pace_str.split(\":\")]\n",
    "            if len(fields) == 2:\n",
    "                minutes, seconds = fields\n",
    "                return minutes * 60 + seconds\n",
    "            if len(fields) == 3:\n",
    "                hours, minutes, seconds = fields\n",
    "                return hours * 3600 + minutes * 60 + seconds\n",
    "        return float(pace_str)\n",
    "    except (TypeError, ValueError):\n",
    "        # NOTE(review): unparsable values become 0 rather than NaN, so they take\n",
    "        # part in the correlations below - confirm this is intended.\n",
    "        return 0\n",
    "\n",
    "\n",
    "def to_numeric_column(series):\n",
    "    \"\"\"Coerce a column of the raw export to numbers.\n",
    "\n",
    "    Non-numeric columns have \",\" removed and the \"--\" placeholder mapped to 0\n",
    "    before conversion; anything that still cannot be parsed becomes NaN.\n",
    "    \"\"\"\n",
    "    # Checking for \"not numeric\" instead of dtype == \"object\" also covers\n",
    "    # pandas string dtypes, which would otherwise skip the text clean-up.\n",
    "    if not pd.api.types.is_numeric_dtype(series):\n",
    "        series = series.str.replace(\",\", \"\", regex=False).replace(\"--\", \"0\")\n",
    "    return pd.to_numeric(series, errors=\"coerce\")\n",
    "\n",
    "\n",
    "activity_timestamp = pd.to_datetime(df_activities[\"date\"], errors=\"coerce\")\n",
    "df_activities[\"date\"] = activity_timestamp.dt.normalize()\n",
    "# Start time as an offset from midnight of the activity date.\n",
    "df_activities[\"activity_starting_time\"] = activity_timestamp - df_activities[\"date\"]\n",
    "\n",
    "df_activities[\"activity_pace_average\"] = df_activities[\"activity_pace_average\"].apply(pace_to_seconds)\n",
    "df_activities[\"activity_duration\"] = pd.to_timedelta(df_activities[\"activity_duration\"], errors=\"coerce\")\n",
    "df_activities[\"activity_duration_seconds\"] = (\n",
    "    df_activities[\"activity_duration\"].dt.total_seconds().fillna(0).astype(int)\n",
    ")\n",
    "\n",
    "df_activities[\"activity_ending_time\"] = (\n",
    "    df_activities[\"activity_starting_time\"] + df_activities[\"activity_duration\"]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2cbef16d",
   "metadata": {},
   "outputs": [],
   "source": [
    "activity_numeric_columns = [\n",
    "    \"activity_calories\",\n",
    "    \"activity_heart_rate_average\",\n",
    "    \"activity_pace_average\",\n",
    "    \"activity_breathing_rate_average\",\n",
    "]\n",
    "\n",
    "for col in activity_numeric_columns:\n",
    "    df_activities[col] = to_numeric_column(df_activities[col])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "708414b6",
   "metadata": {},
   "source": [
    "## Sleep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3ec20f3",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_sleep[\"date\"] = pd.to_datetime(df_sleep[\"date\"], errors=\"coerce\")\n",
    "\n",
    "# Drop nights without a recorded bedtime. The explicit copy makes the column\n",
    "# assignments below write to an independent frame instead of a filtered slice.\n",
    "df_sleep = df_sleep[df_sleep[\"sleep_bedtime\"] != \"--\"].copy()\n",
    "\n",
    "# Parse the clock time once and store it as an offset from midnight.\n",
    "bedtime_clock = pd.to_datetime(\n",
    "    df_sleep[\"sleep_bedtime\"].astype(str), format=\"%I:%M %p\", errors=\"coerce\"\n",
    ")\n",
    "df_sleep[\"sleep_bedtime\"] = pd.to_timedelta(\n",
    "    bedtime_clock.dt.hour * 3600 + bedtime_clock.dt.minute * 60,\n",
    "    unit=\"s\",\n",
    ")\n",
    "\n",
    "# Handle AM times by adding 24 hours to times before noon\n",
    "before_noon = df_sleep[\"sleep_bedtime\"] < pd.Timedelta(hours=12)\n",
    "df_sleep.loc[before_noon, \"sleep_bedtime\"] += pd.Timedelta(days=1)\n",
    "\n",
    "df_sleep[\"sleep_duration\"] = pd.to_timedelta(df_sleep[\"sleep_duration\"], errors=\"coerce\")\n",
    "df_sleep[\"sleep_duration_seconds\"] = df_sleep[\"sleep_duration\"].dt.total_seconds()\n",
    "df_sleep[\"sleep_duration_needed\"] = pd.to_timedelta(df_sleep[\"sleep_duration_needed\"], errors=\"coerce\")\n",
    "df_sleep[\"sleep_duration_needed_seconds\"] = df_sleep[\"sleep_duration_needed\"].dt.total_seconds()\n",
    "\n",
    "df_sleep[\"sleep_duration_needed_delta\"] = df_sleep[\"sleep_duration\"] - df_sleep[\"sleep_duration_needed\"]\n",
    "df_sleep[\"sleep_duration_needed_delta_seconds\"] = df_sleep[\"sleep_duration_needed_delta\"].dt.total_seconds()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "845cc713",
   "metadata": {},
   "outputs": [],
   "source": [
    "sleep_numeric_columns = [\n",
    "    \"sleep_score\",\n",
    "    \"sleep_resting_heart_rate\",\n",
    "    \"sleep_hrv_status\",\n",
    "    \"sleep_breathing_rate\",\n",
    "]\n",
    "\n",
    "for col in sleep_numeric_columns:\n",
    "    df_sleep[col] = to_numeric_column(df_sleep[col])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fabceba5",
   "metadata": {},
   "source": [
    "## Combined"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05da5fe7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the one activity row that ends last on each date. drop_duplicates keeps\n",
    "# that row intact, whereas groupby(...).last() takes the last non-null value of\n",
    "# every column separately and can mix values from different activities.\n",
    "# Rows without an ending time sort first so they are only kept when a date has\n",
    "# no activity with a known ending time.\n",
    "df_latest_activity = (\n",
    "    df_activities\n",
    "    .dropna(subset=[\"date\"])\n",
    "    .sort_values(\"activity_ending_time\", na_position=\"first\")\n",
    "    .drop_duplicates(subset=\"date\", keep=\"last\")\n",
    "    .sort_values(\"date\")\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# Right join: every sleep record is kept, with NaN activity columns on days without training.\n",
    "df_combined = pd.merge(df_latest_activity, df_sleep, on=\"date\", how=\"right\")\n",
    "\n",
    "df_combined[\"bedtime_activity_ending_delta\"] = (\n",
    "    df_combined[\"sleep_bedtime\"] - df_combined[\"activity_ending_time\"]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "427af06c",
   "metadata": {},
   "source": [
    "## Save cleaned data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a8888ce",
   "metadata": {},
   "outputs": [],
   "source": [
    "cleaned_path = Path(\"data/cleaned/combined_activities_sleep.csv\")\n",
    "# Create the output directory so a fresh checkout can run the notebook end to end.\n",
    "cleaned_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "df_combined.to_csv(cleaned_path, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "730ab7c4",
   "metadata": {},
   "source": [
    "## Data overview"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "47ffa998",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_combined.head(30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce818c76",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_combined.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c848335",
   "metadata": {},
   "source": [
    "# Visualizations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e228914c",
   "metadata": {},
   "source": [
    "## Correlation Matrix sleep after all activities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41080d47",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_correlation_matrix(frame, title):\n",
    "    \"\"\"Draw an annotated heatmap of the correlations between the numeric columns of ``frame``.\"\"\"\n",
    "    correlation_matrix = frame.corr(numeric_only=True)\n",
    "    fig, ax = plt.subplots(figsize=(12, 10))\n",
    "    sns.heatmap(correlation_matrix, annot=True, cmap=\"coolwarm\", center=0, ax=ax)\n",
    "    ax.set_title(title)\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "# Rows with activity data are the days on which a training was recorded.\n",
    "has_activity = df_combined[\"activity_calories\"].notna()\n",
    "filtered_all_activities = df_combined[has_activity]\n",
    "\n",
    "# Correlate the filtered frame, not the full one that also contains days without training.\n",
    "plot_correlation_matrix(filtered_all_activities, \"Korrelationsmatrix - Alle Aktivitäten\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d5ba27c4",
   "metadata": {},
   "source": [
    "## Correlation Matrix sleep after activities < 4h before sleep"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf54e6c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "has_activity = df_combined[\"activity_calories\"].notna()\n",
    "# The two masks are complementary (< and >=), so an activity ending exactly\n",
    "# BEDTIME_WINDOW before bedtime lands in exactly one group.\n",
    "ends_within_window = df_combined[\"bedtime_activity_ending_delta\"] < BEDTIME_WINDOW\n",
    "ends_outside_window = df_combined[\"bedtime_activity_ending_delta\"] >= BEDTIME_WINDOW\n",
    "\n",
    "df_combine_activities_4_hours_before_sleep = df_combined[has_activity & ends_within_window]\n",
    "\n",
    "plot_correlation_matrix(\n",
    "    df_combine_activities_4_hours_before_sleep,\n",
    "    \"Korrelationsmatrix- Trainings 4 Stunden vor dem Schlafen\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "634ee858",
   "metadata": {},
   "source": [
    "## Scatter plot"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e2246df2",
   "metadata": {},
   "outputs": [],
   "source": [
    "filtered_df_combined_4h_before_sleep = df_combine_activities_4_hours_before_sleep\n",
    "# >= matches the \"≥4h\" legend label and the grouping used in the pie chart.\n",
    "filtered_df_combined_more_than_4h_before_sleep = df_combined[has_activity & ends_outside_window]\n",
    "\n",
    "blue_count = len(filtered_df_combined_4h_before_sleep)\n",
    "red_count = len(filtered_df_combined_more_than_4h_before_sleep)\n",
    "\n",
    "scatter_groups = [\n",
    "    (filtered_df_combined_4h_before_sleep, \"blue\", f\"Training <4h vor Schlaf (n={blue_count})\"),\n",
    "    (filtered_df_combined_more_than_4h_before_sleep, \"red\", f\"Training ≥4h vor Schlaf (n={red_count})\"),\n",
    "]\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "for frame, color, label in scatter_groups:\n",
    "    ax.scatter(\n",
    "        frame[\"activity_heart_rate_average\"],\n",
    "        frame[\"sleep_score\"],\n",
    "        alpha=0.7,\n",
    "        color=color,\n",
    "        label=label,\n",
    "    )\n",
    "\n",
    "ax.set_xlabel(\"Activity Heart Rate Average\")\n",
    "# The plotted y values are the sleep_score column, so the axis is labelled as such.\n",
    "ax.set_ylabel(\"Sleep Score That Night\")\n",
    "ax.set_ylim(60, 110)\n",
    "ax.grid(True)\n",
    "ax.legend(title=\"Gruppe\", loc=\"best\")\n",
    "plt.show()\n",
    "\n",
    "print(f\"Number of blue points: {blue_count}\")\n",
    "print(f\"Number of red points: {red_count}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52f93b1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "counts = {\n",
    "    \"Kein Training\": int((~has_activity).sum()),\n",
    "    \"Training weniger als 4h vor Schlaf\": int((has_activity & ends_within_window).sum()),\n",
    "    \"Training mehr als 4h vor Schlaf\": int((has_activity & ends_outside_window).sum()),\n",
    "}\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(6, 6))\n",
    "ax.pie(\n",
    "    list(counts.values()),\n",
    "    labels=[f\"{label} ({count})\" for label, count in counts.items()],\n",
    "    autopct=\"%1.1f%%\",\n",
    "    startangle=90,\n",
    ")\n",
    "ax.set_title(\n",
    "    \"Verteilung: Kein Training / Training <4h vor Schlaf / Training ≥4h vor Schlaf\")\n",
    "ax.axis(\"equal\")\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}