cds-1011-health-data-analysis/activity_sleep.ipynb
2025-10-26 18:25:52 +01:00

430 lines
13 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "c2188cd7",
"metadata": {},
"source": [
"# Read data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b2b0060",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import pandas as pd\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52f55dde",
"metadata": {},
"outputs": [],
"source": [
"# Column headers as they appear in the activity export, mapped to the names used below.\n",
"activity_cols = {\n",
"    \"Datum\": \"date\",\n",
"    \"Ø Pace\": \"activity_pace_average\",\n",
"    \"Kalorien\": \"activity_calories\",\n",
"    \"Zeit\": \"activity_duration\",\n",
"    \"Ø Herzfrequenz\": \"activity_heart_rate_average\",\n",
"    \"Ø Atemfrequenz\": \"activity_breathing_rate_average\",\n",
"}\n",
"# Only the mapped columns are loaded; they are renamed right after reading.\n",
"df_activities = pd.read_csv(\n",
"    \"data/raw/all_activities.csv\",\n",
"    usecols=list(activity_cols),\n",
").rename(columns=activity_cols)\n",
"\n",
"# Same approach for the sleep export; its \"Sleep Score 7 Tage\" column is the one renamed to \"date\".\n",
"sleep_cols = {\n",
"    \"Sleep Score 7 Tage\": \"date\",\n",
"    \"Score\": \"sleep_score\",\n",
"    \"Dauer\": \"sleep_duration\",\n",
"    \"Schlafenszeit\": \"sleep_bedtime\",\n",
"    \"Ruheherzfrequenz\": \"sleep_resting_heart_rate\",\n",
"    \"HFV-Status\": \"sleep_hrv_status\",\n",
"    \"Atmung\": \"sleep_breathing_rate\",\n",
"    \"Schlafbedürfnis\": \"sleep_duration_needed\",\n",
"}\n",
"df_sleep = pd.read_csv(\n",
"    \"data/raw/sleep.csv\",\n",
"    usecols=list(sleep_cols),\n",
").rename(columns=sleep_cols)"
]
},
{
"cell_type": "markdown",
"id": "91f04fe7",
"metadata": {},
"source": [
"# Clean data/time features"
]
},
{
"cell_type": "markdown",
"id": "97252627",
"metadata": {},
"source": [
"## Activities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "826e5af0",
"metadata": {},
"outputs": [],
"source": [
"def pace_to_seconds(pace_str):\n",
"    \"\"\"Convert a pace value to a number of seconds.\n",
"\n",
"    Strings written as MM:SS or HH:MM:SS are converted field by field;\n",
"    every other value is handed to float(). Whatever cannot be converted\n",
"    yields 0.\n",
"    \"\"\"\n",
"    try:\n",
"        if isinstance(pace_str, str) and \":\" in pace_str:\n",
"            fields = [int(piece) for piece in pace_str.split(\":\")]\n",
"            if len(fields) == 2:\n",
"                mins, secs = fields\n",
"                return 60 * mins + secs\n",
"            if len(fields) == 3:\n",
"                hrs, mins, secs = fields\n",
"                return 3600 * hrs + 60 * mins + secs\n",
"        # No colon, or an unexpected number of fields: fall back to float().\n",
"        return float(pace_str)\n",
"    except Exception:\n",
"        return 0\n",
"\n",
"\n",
"# Split the activity timestamp into the calendar day and the offset from midnight.\n",
"timestamps = pd.to_datetime(df_activities[\"date\"], errors=\"coerce\")\n",
"midnights = timestamps.dt.normalize()\n",
"df_activities[\"date\"] = midnights\n",
"df_activities[\"activity_starting_time\"] = timestamps - midnights\n",
"\n",
"df_activities[\"activity_pace_average\"] = df_activities[\"activity_pace_average\"].apply(pace_to_seconds)\n",
"\n",
"# Durations that cannot be parsed become NaT and are counted as 0 seconds.\n",
"durations = pd.to_timedelta(df_activities[\"activity_duration\"], errors=\"coerce\")\n",
"df_activities[\"activity_duration\"] = durations\n",
"df_activities[\"activity_duration_seconds\"] = durations.dt.total_seconds().fillna(0).astype(int)\n",
"\n",
"# End of the activity, again as an offset from midnight of the activity's day.\n",
"df_activities[\"activity_ending_time\"] = df_activities[\"activity_starting_time\"] + durations\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2cbef16d",
"metadata": {},
"outputs": [],
"source": [
"numeric_columns = [\n",
"    \"activity_calories\",\n",
"    \"activity_heart_rate_average\",\n",
"    \"activity_pace_average\",\n",
"    \"activity_breathing_rate_average\",\n",
"]\n",
"\n",
"for col in numeric_columns:\n",
"    values = df_activities[col]\n",
"    # Text columns need cleaning before conversion. Checking for both object and\n",
"    # string dtypes keeps this working when pandas infers a dedicated string dtype\n",
"    # instead of object (a plain `dtype == \"object\"` test would skip the cleaning).\n",
"    if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):\n",
"        # Strip commas and turn the \"--\" placeholder into 0.\n",
"        values = values.str.replace(',', '').replace('--', '0')\n",
"    # Anything still not numeric becomes NaN.\n",
"    df_activities[col] = pd.to_numeric(values, errors='coerce')"
]
},
{
"cell_type": "markdown",
"id": "708414b6",
"metadata": {},
"source": [
"## Sleep"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d3ec20f3",
"metadata": {},
"outputs": [],
"source": [
"df_sleep[\"date\"] = pd.to_datetime(df_sleep[\"date\"], errors=\"coerce\")\n",
"\n",
"# Drop rows whose bedtime is the \"--\" placeholder. The explicit copy makes the\n",
"# column assignments below write to an independent frame instead of a slice\n",
"# (avoids SettingWithCopyWarning and ambiguous write-through behaviour).\n",
"df_sleep = df_sleep[df_sleep[\"sleep_bedtime\"] != \"--\"].copy()\n",
"\n",
"# Parse the 12-hour clock string once and store it as an offset from midnight.\n",
"bedtime_clock = pd.to_datetime(df_sleep[\"sleep_bedtime\"].astype(str), format=\"%I:%M %p\")\n",
"df_sleep[\"sleep_bedtime\"] = pd.to_timedelta(\n",
"    bedtime_clock.dt.hour * 3600 + bedtime_clock.dt.minute * 60,\n",
"    unit=\"s\",\n",
")\n",
"\n",
"# Bedtimes before noon get 24 hours added, so they sort after the evening bedtimes.\n",
"mask = df_sleep[\"sleep_bedtime\"] < pd.Timedelta(hours=12)\n",
"df_sleep.loc[mask, \"sleep_bedtime\"] = df_sleep.loc[mask, \"sleep_bedtime\"] + pd.Timedelta(days=1)\n",
"\n",
"# Durations: keep the timedelta and add a plain-seconds companion column for each.\n",
"df_sleep[\"sleep_duration\"] = pd.to_timedelta(df_sleep[\"sleep_duration\"], errors=\"coerce\")\n",
"df_sleep[\"sleep_duration_seconds\"] = df_sleep[\"sleep_duration\"].dt.total_seconds()\n",
"df_sleep[\"sleep_duration_needed\"] = pd.to_timedelta(df_sleep[\"sleep_duration_needed\"], errors=\"coerce\")\n",
"df_sleep[\"sleep_duration_needed_seconds\"] = df_sleep[\"sleep_duration_needed\"].dt.total_seconds()\n",
"\n",
"# Positive delta: slept longer than the stated need; negative: slept less.\n",
"df_sleep[\"sleep_duration_needed_delta\"] = df_sleep[\"sleep_duration\"] - df_sleep[\"sleep_duration_needed\"]\n",
"df_sleep[\"sleep_duration_needed_delta_seconds\"] = df_sleep[\"sleep_duration_needed_delta\"].dt.total_seconds()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "845cc713",
"metadata": {},
"outputs": [],
"source": [
"numeric_columns = [\n",
"    \"sleep_score\",\n",
"    \"sleep_resting_heart_rate\",\n",
"    \"sleep_hrv_status\",\n",
"    \"sleep_breathing_rate\",\n",
"]\n",
"\n",
"for col in numeric_columns:\n",
"    values = df_sleep[col]\n",
"    # Text columns need cleaning before conversion. Checking for both object and\n",
"    # string dtypes keeps this working when pandas infers a dedicated string dtype\n",
"    # instead of object (a plain `dtype == \"object\"` test would skip the cleaning).\n",
"    if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):\n",
"        # Strip commas and turn the \"--\" placeholder into 0.\n",
"        values = values.str.replace(',', '').replace('--', '0')\n",
"    # Anything still not numeric becomes NaN.\n",
"    df_sleep[col] = pd.to_numeric(values, errors='coerce')"
]
},
{
"cell_type": "markdown",
"id": "fabceba5",
"metadata": {},
"source": [
"## Combined"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05da5fe7",
"metadata": {},
"outputs": [],
"source": [
"# Keep, per day, the single activity row that ended last.\n",
"# drop_duplicates keeps whole rows; groupby(...).last() would instead take the last\n",
"# non-null value of every column separately and could blend several activities\n",
"# of the same day into one row.\n",
"df_latest_activity = (\n",
"    df_activities\n",
"    .dropna(subset=['date'])  # rows without a parsable date cannot be matched to a night\n",
"    .sort_values('activity_ending_time', na_position='first')  # unknown end times never win\n",
"    .drop_duplicates(subset='date', keep='last')\n",
"    .sort_values('date')\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Right merge: every sleep row is kept; days without an activity get empty activity columns.\n",
"df_combined = pd.merge(df_latest_activity, df_sleep, on='date', how='right')\n",
"\n",
"# Time between the end of the day's last activity and bedtime (both are offsets from midnight).\n",
"df_combined[\"bedtime_activity_ending_delta\"] = df_combined[\"sleep_bedtime\"] - df_combined[\"activity_ending_time\"]\n"
]
},
{
"cell_type": "markdown",
"id": "427af06c",
"metadata": {},
"source": [
"## Save cleaned data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a8888ce",
"metadata": {},
"outputs": [],
"source": [
"output_path = Path(\"data/cleaned/combined_activities_sleep.csv\")\n",
"# Create the target folder if it is missing so the export also works on a fresh checkout.\n",
"output_path.parent.mkdir(parents=True, exist_ok=True)\n",
"df_combined.to_csv(output_path, index=False)"
]
},
{
"cell_type": "markdown",
"id": "730ab7c4",
"metadata": {},
"source": [
"## Data overview"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47ffa998",
"metadata": {},
"outputs": [],
"source": [
"df_combined.head(30)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ce818c76",
"metadata": {},
"outputs": [],
"source": [
"df_combined.dtypes"
]
},
{
"cell_type": "markdown",
"id": "4c848335",
"metadata": {},
"source": [
"# Visualizations"
]
},
{
"cell_type": "markdown",
"id": "e228914c",
"metadata": {},
"source": [
"## Correlation Matrix sleep after all activities"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "41080d47",
"metadata": {},
"outputs": [],
"source": [
"# Restrict to rows that have an activity (activity_calories is present).\n",
"filtered_all_activities = df_combined[df_combined[\"activity_calories\"].notna()]\n",
"# Build the matrix from the filtered rows; previously the filter was computed but\n",
"# the unfiltered frame was correlated.\n",
"correlation_matrix = filtered_all_activities.corr(numeric_only=True)\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 10))\n",
"sns.heatmap(correlation_matrix, annot=True, cmap=\"coolwarm\", center=0, ax=ax)\n",
"ax.set_title(\"Korrelationsmatrix - Alle Aktivitäten\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "d5ba27c4",
"metadata": {},
"source": [
"## Correlation Matrix sleep after activities < 4h before sleep"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cf54e6c7",
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Rows with an activity that ended less than four hours before bedtime.\n",
"has_activity = df_combined[\"activity_calories\"].notna()\n",
"ends_within_4h = df_combined[\"bedtime_activity_ending_delta\"] < pd.Timedelta(hours=4)\n",
"df_combine_activities_4_hours_before_sleep = df_combined[has_activity & ends_within_4h]\n",
"\n",
"correlation_matrix = df_combine_activities_4_hours_before_sleep.corr(numeric_only=True)\n",
"\n",
"fig, ax = plt.subplots(figsize=(12, 10))\n",
"sns.heatmap(correlation_matrix, annot=True, cmap=\"coolwarm\", center=0, ax=ax)\n",
"ax.set_title(\"Korrelationsmatrix- Trainings 4 Stunden vor dem Schlafen\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "634ee858",
"metadata": {},
"source": [
"## Scatter plot"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e2246df2",
"metadata": {},
"outputs": [],
"source": [
"# Split the rows that have an activity at the 4-hour mark between activity end and bedtime.\n",
"has_activity = df_combined[\"activity_calories\"].notna()\n",
"delta = df_combined[\"bedtime_activity_ending_delta\"]\n",
"late_threshold = pd.Timedelta(hours=4)\n",
"\n",
"filtered_df_combined_4h_before_sleep = df_combined[has_activity & (delta < late_threshold)]\n",
"# >= so that exactly 4h falls into this group, matching the legend text (\"≥4h\")\n",
"# and the grouping used for the pie chart.\n",
"filtered_df_combined_more_than_4h_before_sleep = df_combined[has_activity & (delta >= late_threshold)]\n",
"\n",
"blue_count = len(filtered_df_combined_4h_before_sleep)\n",
"red_count = len(filtered_df_combined_more_than_4h_before_sleep)\n",
"\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"ax.scatter(\n",
"    filtered_df_combined_4h_before_sleep['activity_heart_rate_average'],\n",
"    filtered_df_combined_4h_before_sleep['sleep_score'],\n",
"    alpha=0.7,\n",
"    color='blue',\n",
"    label=f'Training <4h vor Schlaf (n={blue_count})'\n",
")\n",
"ax.scatter(\n",
"    filtered_df_combined_more_than_4h_before_sleep['activity_heart_rate_average'],\n",
"    filtered_df_combined_more_than_4h_before_sleep['sleep_score'],\n",
"    alpha=0.7,\n",
"    color='red',\n",
"    label=f'Training ≥4h vor Schlaf (n={red_count})'\n",
")\n",
"\n",
"ax.set_xlabel('Activity Heart Rate Average')\n",
"# The y values are sleep_score, so label the axis accordingly.\n",
"ax.set_ylabel('Sleep Score That Night')\n",
"ax.set_ylim(60, 110)\n",
"ax.grid(True)\n",
"ax.legend(title='Gruppe', loc='best')\n",
"plt.show()\n",
"\n",
"print(f'Number of blue points: {blue_count}')\n",
"print(f'Number of red points: {red_count}')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52f93b1e",
"metadata": {},
"outputs": [],
"source": [
"# Boolean masks for the three groups; summing a mask counts its True rows.\n",
"has_activity = df_combined[\"activity_calories\"].notna()\n",
"delta = df_combined[\"bedtime_activity_ending_delta\"]\n",
"four_hours = pd.Timedelta(hours=4)\n",
"\n",
"counts = {\n",
"    \"Kein Training\": int((~has_activity).sum()),\n",
"    \"Training weniger als 4h vor Schlaf\": int((has_activity & (delta < four_hours)).sum()),\n",
"    \"Training mehr als 4h vor Schlaf\": int((has_activity & (delta >= four_hours)).sum()),\n",
"}\n",
"\n",
"labels = list(counts)\n",
"sizes = list(counts.values())\n",
"\n",
"# Each wedge label carries the group name plus its absolute count.\n",
"wedge_labels = [f\"{name} ({size})\" for name, size in counts.items()]\n",
"\n",
"plt.figure(figsize=(6, 6))\n",
"plt.pie(sizes, labels=wedge_labels, autopct=\"%1.1f%%\", startangle=90)\n",
"plt.title(\n",
"    \"Verteilung: Kein Training / Training <4h vor Schlaf / Training ≥4h vor Schlaf\")\n",
"plt.axis(\"equal\")\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}