{ "cells": [ { "cell_type": "markdown", "id": "f6f0a842", "metadata": {}, "source": [ "# Exploratory Data Analysis" ] }, { "cell_type": "markdown", "id": "386dcd6d", "metadata": {}, "source": [ "import the required libraries" ] }, { "cell_type": "code", "execution_count": 5, "id": "f9b0ae4c", "metadata": {}, "outputs": [], "source": [ "#importing the required libraries\n", "import numpy as np \n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import seaborn as sns\n", "import os \n" ] }, { "cell_type": "code", "execution_count": 25, "id": "1c92d2c2", "metadata": {}, "outputs": [], "source": [ "#load the datasets\n", "df_activities = pd.read_csv(\"all_activities.csv\")\n", "df_sleep = pd.read_csv(\"sleep.csv\")" ] }, { "cell_type": "code", "execution_count": 26, "id": "4f18cf6a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AktivitätstypDatumFavoritTitelDistanzKalorienZeitØ HerzfrequenzMaximale HerzfrequenzAerober TE...Maximale AtemfrequenzStressänderungStress: StartStress: EndeØ StressMaximaler StressZeit in BewegungVerstrichene ZeitMinimale HöheMaximale Höhe
0Laufen2025-09-21 09:53:57FalseBerlin - BMW Berlin Marathon (42.195 km)42.652,81704:35:151481655.0...41----------04:34:5504:35:153356
1Laufen2025-09-20 15:18:50FalseBerlin Laufen3.7224600:20:131441612.8...38----------00:20:1200:20:134047
2Virtuelles Radfahren2025-09-19 12:31:00FalseZwift - Renewal on Bridges and Boardwalks in M...12.8121000:30:181161341.5...------------00:30:1700:30:17459
3Laufen2025-09-15 11:16:13FalseRüthi Laufen5.6035000:30:391331512.8...39----------00:30:3700:30:39421437
4Laufen2025-09-13 10:51:36FalseRüthi Laufen9.0154900:44:561441713.5...41----------00:44:4500:44:56421435
\n", "

5 rows × 52 columns

\n", "
" ], "text/plain": [ " Aktivitätstyp Datum Favorit \\\n", "0 Laufen 2025-09-21 09:53:57 False \n", "1 Laufen 2025-09-20 15:18:50 False \n", "2 Virtuelles Radfahren 2025-09-19 12:31:00 False \n", "3 Laufen 2025-09-15 11:16:13 False \n", "4 Laufen 2025-09-13 10:51:36 False \n", "\n", " Titel Distanz Kalorien \\\n", "0 Berlin - BMW Berlin Marathon (42.195 km) 42.65 2,817 \n", "1 Berlin Laufen 3.72 246 \n", "2 Zwift - Renewal on Bridges and Boardwalks in M... 12.81 210 \n", "3 Rüthi Laufen 5.60 350 \n", "4 Rüthi Laufen 9.01 549 \n", "\n", " Zeit Ø Herzfrequenz Maximale Herzfrequenz Aerober TE ... \\\n", "0 04:35:15 148 165 5.0 ... \n", "1 00:20:13 144 161 2.8 ... \n", "2 00:30:18 116 134 1.5 ... \n", "3 00:30:39 133 151 2.8 ... \n", "4 00:44:56 144 171 3.5 ... \n", "\n", " Maximale Atemfrequenz Stressänderung Stress: Start Stress: Ende Ø Stress \\\n", "0 41 -- -- -- -- \n", "1 38 -- -- -- -- \n", "2 -- -- -- -- -- \n", "3 39 -- -- -- -- \n", "4 41 -- -- -- -- \n", "\n", " Maximaler Stress Zeit in Bewegung Verstrichene Zeit Minimale Höhe \\\n", "0 -- 04:34:55 04:35:15 33 \n", "1 -- 00:20:12 00:20:13 40 \n", "2 -- 00:30:17 00:30:17 4 \n", "3 -- 00:30:37 00:30:39 421 \n", "4 -- 00:44:45 00:44:56 421 \n", "\n", " Maximale Höhe \n", "0 56 \n", "1 47 \n", "2 59 \n", "3 437 \n", "4 435 \n", "\n", "[5 rows x 52 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check activities data\n", "df_activities.head()\n" ] }, { "cell_type": "code", "execution_count": 27, "id": "e65aa687", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Sleep Score 7 TageScoreRuheherzfrequenzBody BatteryPulsoximeterAtmungHFV-StatusQualitätDauerSchlafbedürfnisSchlafenszeitAufstehzeit;;;;;;;;;;;
02025-09-30774455--11.3883Ausreichend6h 47min7h 0min11:22 PM6:09 AM;;;;;;;;;;;
12025-09-29734662--1284Ausreichend9h 6min8h 40min10:52 PM8:02 AM;;;;;;;;;;;
22025-09-28344721--14.9684Schlecht6h 34min7h 40min12:34 AM8:09 AM;;;;;;;;;;;
32025-09-27934167--11.1392Ausgezeichnet8h 32min7h 40min10:39 PM7:20 AM;;;;;;;;;;;
42025-09-26974271--11.1590Ausgezeichnet7h 50min7h 40min10:07 PM5:57 AM;;;;;;;;;;;
\n", "
" ], "text/plain": [ " Sleep Score 7 Tage Score Ruheherzfrequenz Body Battery Pulsoximeter Atmung \\\n", "0 2025-09-30 77 44 55 -- 11.38 \n", "1 2025-09-29 73 46 62 -- 12 \n", "2 2025-09-28 34 47 21 -- 14.96 \n", "3 2025-09-27 93 41 67 -- 11.13 \n", "4 2025-09-26 97 42 71 -- 11.15 \n", "\n", " HFV-Status Qualität Dauer Schlafbedürfnis Schlafenszeit \\\n", "0 83 Ausreichend 6h 47min 7h 0min 11:22 PM \n", "1 84 Ausreichend 9h 6min 8h 40min 10:52 PM \n", "2 84 Schlecht 6h 34min 7h 40min 12:34 AM \n", "3 92 Ausgezeichnet 8h 32min 7h 40min 10:39 PM \n", "4 90 Ausgezeichnet 7h 50min 7h 40min 10:07 PM \n", "\n", " Aufstehzeit;;;;;;;;;;; \n", "0 6:09 AM;;;;;;;;;;; \n", "1 8:02 AM;;;;;;;;;;; \n", "2 8:09 AM;;;;;;;;;;; \n", "3 7:20 AM;;;;;;;;;;; \n", "4 5:57 AM;;;;;;;;;;; " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check sleep data\n", "df_sleep.head()" ] }, { "cell_type": "code", "execution_count": 28, "id": "2b832a91", "metadata": {}, "outputs": [], "source": [ "# change the first column name to Datum\n", "df_sleep.rename(columns={df_sleep.columns[0]: 'Datum'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 29, "id": "70fe281d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumScoreRuheherzfrequenzBody BatteryPulsoximeterAtmungHFV-StatusQualitätDauerSchlafbedürfnisSchlafenszeitAufstehzeit;;;;;;;;;;;
02025-09-30774455--11.3883Ausreichend6h 47min7h 0min11:22 PM6:09 AM;;;;;;;;;;;
12025-09-29734662--1284Ausreichend9h 6min8h 40min10:52 PM8:02 AM;;;;;;;;;;;
22025-09-28344721--14.9684Schlecht6h 34min7h 40min12:34 AM8:09 AM;;;;;;;;;;;
32025-09-27934167--11.1392Ausgezeichnet8h 32min7h 40min10:39 PM7:20 AM;;;;;;;;;;;
42025-09-26974271--11.1590Ausgezeichnet7h 50min7h 40min10:07 PM5:57 AM;;;;;;;;;;;
\n", "
" ], "text/plain": [ " Datum Score Ruheherzfrequenz Body Battery Pulsoximeter Atmung \\\n", "0 2025-09-30 77 44 55 -- 11.38 \n", "1 2025-09-29 73 46 62 -- 12 \n", "2 2025-09-28 34 47 21 -- 14.96 \n", "3 2025-09-27 93 41 67 -- 11.13 \n", "4 2025-09-26 97 42 71 -- 11.15 \n", "\n", " HFV-Status Qualität Dauer Schlafbedürfnis Schlafenszeit \\\n", "0 83 Ausreichend 6h 47min 7h 0min 11:22 PM \n", "1 84 Ausreichend 9h 6min 8h 40min 10:52 PM \n", "2 84 Schlecht 6h 34min 7h 40min 12:34 AM \n", "3 92 Ausgezeichnet 8h 32min 7h 40min 10:39 PM \n", "4 90 Ausgezeichnet 7h 50min 7h 40min 10:07 PM \n", "\n", " Aufstehzeit;;;;;;;;;;; \n", "0 6:09 AM;;;;;;;;;;; \n", "1 8:02 AM;;;;;;;;;;; \n", "2 8:09 AM;;;;;;;;;;; \n", "3 7:20 AM;;;;;;;;;;; \n", "4 5:57 AM;;;;;;;;;;; " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check the sleep data again\n", "df_sleep.head()" ] }, { "cell_type": "code", "execution_count": 30, "id": "daebf9ac", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
AktivitätstypDatumFavoritTitelDistanzKalorienZeitØ HerzfrequenzMaximale HerzfrequenzAerober TE...Maximale AtemfrequenzStressänderungStress: StartStress: EndeØ StressMaximaler StressZeit in BewegungVerstrichene ZeitMinimale HöheMaximale Höhe
0Laufen2025-09-21 09:53:57FalseBerlin - BMW Berlin Marathon (42.195 km)42.652,81704:35:151481655.0...41----------04:34:5504:35:153356
1Laufen2025-09-20 15:18:50FalseBerlin Laufen3.7224600:20:131441612.8...38----------00:20:1200:20:134047
2Virtuelles Radfahren2025-09-19 12:31:00FalseZwift - Renewal on Bridges and Boardwalks in M...12.8121000:30:181161341.5...------------00:30:1700:30:17459
3Laufen2025-09-15 11:16:13FalseRüthi Laufen5.6035000:30:391331512.8...39----------00:30:3700:30:39421437
4Laufen2025-09-13 10:51:36FalseRüthi Laufen9.0154900:44:561441713.5...41----------00:44:4500:44:56421435
\n", "

5 rows × 52 columns

\n", "
" ], "text/plain": [ " Aktivitätstyp Datum Favorit \\\n", "0 Laufen 2025-09-21 09:53:57 False \n", "1 Laufen 2025-09-20 15:18:50 False \n", "2 Virtuelles Radfahren 2025-09-19 12:31:00 False \n", "3 Laufen 2025-09-15 11:16:13 False \n", "4 Laufen 2025-09-13 10:51:36 False \n", "\n", " Titel Distanz Kalorien \\\n", "0 Berlin - BMW Berlin Marathon (42.195 km) 42.65 2,817 \n", "1 Berlin Laufen 3.72 246 \n", "2 Zwift - Renewal on Bridges and Boardwalks in M... 12.81 210 \n", "3 Rüthi Laufen 5.60 350 \n", "4 Rüthi Laufen 9.01 549 \n", "\n", " Zeit Ø Herzfrequenz Maximale Herzfrequenz Aerober TE ... \\\n", "0 04:35:15 148 165 5.0 ... \n", "1 00:20:13 144 161 2.8 ... \n", "2 00:30:18 116 134 1.5 ... \n", "3 00:30:39 133 151 2.8 ... \n", "4 00:44:56 144 171 3.5 ... \n", "\n", " Maximale Atemfrequenz Stressänderung Stress: Start Stress: Ende Ø Stress \\\n", "0 41 -- -- -- -- \n", "1 38 -- -- -- -- \n", "2 -- -- -- -- -- \n", "3 39 -- -- -- -- \n", "4 41 -- -- -- -- \n", "\n", " Maximaler Stress Zeit in Bewegung Verstrichene Zeit Minimale Höhe \\\n", "0 -- 04:34:55 04:35:15 33 \n", "1 -- 00:20:12 00:20:13 40 \n", "2 -- 00:30:17 00:30:17 4 \n", "3 -- 00:30:37 00:30:39 421 \n", "4 -- 00:44:45 00:44:56 421 \n", "\n", " Maximale Höhe \n", "0 56 \n", "1 47 \n", "2 59 \n", "3 437 \n", "4 435 \n", "\n", "[5 rows x 52 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_activities.head()" ] }, { "cell_type": "markdown", "id": "119e7f65", "metadata": {}, "source": [ "Um die Frage \"Wie sich der Sport auf die Ruheherzfrequenz (RHF) auswirkt\" zu beantworten, erstellen wir einen neuen, kombinierten Datensatz.\n", "\n", "Dafür nutzen wir:\n", "\n", "Aus sleep.csv: Das Date (Datum) und die RHR (Ruheherzfrequenz). 
Dies ist der Wert, den wir messen möchten.\n", "\n", "Aus all_activities.csv: die Spalte Datum (Zeitstempel der Aktivität) und die Spalte Kalorien (Kalorienverbrauch).\n", "\n", "Zuerst summieren wir die Kalorien-Werte pro Datum in der all_activities.csv, um die tägliche Gesamtaktivität zu erhalten. Anschließend verbinden wir diese täglichen Aktivitätsdaten mit den RHF-Werten aus der sleep.csv über das gemeinsame Datum." ] }, { "cell_type": "code", "execution_count": 32, "id": "411dec6a", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumRuheherzfrequenz
02025-09-3044
12025-09-2946
22025-09-2847
32025-09-2741
42025-09-2642
\n", "
" ], "text/plain": [ " Datum Ruheherzfrequenz\n", "0 2025-09-30 44\n", "1 2025-09-29 46\n", "2 2025-09-28 47\n", "3 2025-09-27 41\n", "4 2025-09-26 42" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pull the Datum and Ruheherzfrequenz from sleep.csv\n", "df_sleep_filtered = df_sleep[['Datum', 'Ruheherzfrequenz']]\n", "df_sleep_filtered.head()\n" ] }, { "cell_type": "code", "execution_count": 33, "id": "8876cc1f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumKalorien
02025-09-21 09:53:572,817
12025-09-20 15:18:50246
22025-09-19 12:31:00210
32025-09-15 11:16:13350
42025-09-13 10:51:36549
\n", "
" ], "text/plain": [ " Datum Kalorien\n", "0 2025-09-21 09:53:57 2,817\n", "1 2025-09-20 15:18:50 246\n", "2 2025-09-19 12:31:00 210\n", "3 2025-09-15 11:16:13 350\n", "4 2025-09-13 10:51:36 549" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Pull the Datum and Kalorien columns out of the activities table.\n", "# .copy() makes this an independent frame, so the column assignments in the\n", "# following cells modify it directly instead of raising SettingWithCopyWarning.\n", "df_activities_filtered = df_activities[['Datum', 'Kalorien']].copy()\n", "df_activities_filtered.head()\n" ] }, { "cell_type": "code", "execution_count": 34, "id": "54d3116d", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\hizlanarif\\AppData\\Local\\Temp\\ipykernel_3384\\2850544847.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_activities_filtered['Datum'] = pd.to_datetime(df_activities_filtered['Datum']).dt.date\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumKalorien
02025-09-212,817
12025-09-20246
22025-09-19210
32025-09-15350
42025-09-13549
\n", "
" ], "text/plain": [ " Datum Kalorien\n", "0 2025-09-21 2,817\n", "1 2025-09-20 246\n", "2 2025-09-19 210\n", "3 2025-09-15 350\n", "4 2025-09-13 549" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Drop the time-of-day from Datum but keep a real datetime64 column:\n", "# .dt.normalize() sets every timestamp to midnight, whereas .dt.date would leave\n", "# Python date objects in an object column.\n", "# assign() returns a new frame, so nothing is written into a slice of df_activities.\n", "df_activities_filtered = df_activities_filtered.assign(\n", "    Datum=pd.to_datetime(df_activities_filtered['Datum']).dt.normalize()\n", ")\n", "df_activities_filtered.head()" ] }, { "cell_type": "code", "execution_count": 48, "id": "f26dc1cf", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumKalorien
02025-09-212817
12025-09-20246
22025-09-19210
32025-09-15350
42025-09-13549
\n", "
" ], "text/plain": [ " Datum Kalorien\n", "0 2025-09-21 2817\n", "1 2025-09-20 246\n", "2 2025-09-19 210\n", "3 2025-09-15 350\n", "4 2025-09-13 549" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_activities_filtered.head()" ] }, { "cell_type": "code", "execution_count": 39, "id": "2caa80da", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Datum object\n", "Kalorien object\n", "dtype: object" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check the data types\n", "df_activities_filtered.dtypes" ] }, { "cell_type": "code", "execution_count": 42, "id": "503b1ae9", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\hizlanarif\\AppData\\Local\\Temp\\ipykernel_3384\\2726110581.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_activities_filtered['Datum'] = pd.to_datetime(df_activities_filtered['Datum'])\n", "C:\\Users\\hizlanarif\\AppData\\Local\\Temp\\ipykernel_3384\\2726110581.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_activities_filtered['Kalorien'] = df_activities_filtered['Kalorien'].str.replace(',', '')\n", "C:\\Users\\hizlanarif\\AppData\\Local\\Temp\\ipykernel_3384\\2726110581.py:6: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in 
the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_activities_filtered['Kalorien'] = df_activities_filtered['Kalorien'].replace('--', '0')\n", "C:\\Users\\hizlanarif\\AppData\\Local\\Temp\\ipykernel_3384\\2726110581.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_activities_filtered['Kalorien'] = df_activities_filtered['Kalorien'].astype(int)\n" ] }, { "data": { "text/plain": [ "Datum datetime64[ns]\n", "Kalorien int64\n", "dtype: object" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Parse Datum and Kalorien into proper dtypes. The result is built with assign(), so a\n", "# new frame is produced instead of writing into a slice of df_activities.\n", "kalorien_text = (\n", "    df_activities_filtered['Kalorien']\n", "    # astype(str) lets this cell be re-run after Kalorien has already become numeric\n", "    .astype(str)\n", "    # drop thousands separators such as 2,817\n", "    .str.replace(',', '', regex=False)\n", "    # treat the '--' placeholder as 0 calories\n", "    .replace('--', '0')\n", ")\n", "df_activities_filtered = df_activities_filtered.assign(\n", "    Datum=pd.to_datetime(df_activities_filtered['Datum']),\n", "    Kalorien=kalorien_text.astype(int),\n", ")\n", "df_activities_filtered.dtypes" ] }, { "cell_type": "code", "execution_count": 49, "id": "3ca1d34a", "metadata": {}, "outputs": [], "source": [ "#write this cleaned data to a new csv file\n", "df_activities_filtered.to_csv('cleaned_activities.csv', index=False)" ] }, { "cell_type": "code", "execution_count": 50, "id": "8e6a8924", "metadata": {}, "outputs": [], "source": [ "#write the cleaned sleep data to new csv files\n", "df_sleep_filtered.to_csv('cleaned_sleep.csv', index=False)\n" ] }, { "cell_type": "code", "execution_count": 43, "id": 
"c19c4423", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check if each date is unique in df_activities_filtered\n", "df_activities_filtered['Datum'].is_unique" ] }, { "cell_type": "code", "execution_count": 51, "id": "8c8dfaa6", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\hizlanarif\\AppData\\Local\\Temp\\ipykernel_3384\\2358408937.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " df_activities_filtered['Datum'] = pd.to_datetime(df_activities_filtered['Datum'], format='%d.%m.%Y').dt.date\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumKalorien
02025-09-212817
12025-09-20246
22025-09-19210
32025-09-15350
42025-09-13549
\n", "
" ], "text/plain": [ " Datum Kalorien\n", "0 2025-09-21 2817\n", "1 2025-09-20 246\n", "2 2025-09-19 210\n", "3 2025-09-15 350\n", "4 2025-09-13 549" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep Datum as a date-only datetime64 column.\n", "# No format= is passed: the values are ISO dates (YYYY-MM-DD), which '%d.%m.%Y' does not describe.\n", "# .dt.normalize() (instead of .dt.date) keeps the datetime64 dtype that the\n", "# resample('D', on='Datum') call further down requires.\n", "df_activities_filtered = df_activities_filtered.assign(\n", "    Datum=pd.to_datetime(df_activities_filtered['Datum']).dt.normalize()\n", ")\n", "df_activities_filtered.head()" ] }, { "cell_type": "markdown", "id": "0d0dd445", "metadata": {}, "source": [ "We see that each value is not unique in Datum which suggests that for some days there are more than one entry." ] }, { "cell_type": "code", "execution_count": 44, "id": "31a6f98d", "metadata": {}, "outputs": [], "source": [ "# sum the Kalorien values per Date in all-activities.csv to get daily total activity\n", "df_activities_daily = df_activities_filtered.groupby('Datum').sum().reset_index()" ] }, { "cell_type": "code", "execution_count": 45, "id": "864c302b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DatumKalorien
02021-07-07432
12021-07-08544
22021-07-12441
32021-07-13384
42021-08-20891
\n", "
" ], "text/plain": [ " Datum Kalorien\n", "0 2021-07-07 432\n", "1 2021-07-08 544\n", "2 2021-07-12 441\n", "3 2021-07-13 384\n", "4 2021-08-20 891" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_activities_daily.head()" ] }, { "cell_type": "code", "execution_count": 46, "id": "b4c67f8e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#check if the dates are unique now\n", "df_activities_daily['Datum'].is_unique" ] }, { "cell_type": "code", "execution_count": null, "id": "3965ac4a", "metadata": {}, "outputs": [], "source": [ "# Daily calorie totals. resample('D') needs a datetime64 key, so Datum is converted first;\n", "# it also emits the days without any activity (sum = 0), so rest days stay in the data.\n", "activities_by_timestamp = df_activities_filtered.assign(\n", "    Datum=pd.to_datetime(df_activities_filtered['Datum'])\n", ")\n", "df_activities_daily = activities_by_timestamp.resample('D', on='Datum').sum().reset_index()\n", "# df_sleep was read without date parsing, so its Datum is text. Parse it so that both\n", "# merge keys are datetime64 (pandas refuses to merge a datetime64 key with an object key).\n", "sleep_rhr = df_sleep[['Datum', 'Ruheherzfrequenz']].assign(\n", "    Datum=pd.to_datetime(df_sleep['Datum'])\n", ")\n", "# Inner join: keep only the days that appear in both tables\n", "df_combined = pd.merge(df_activities_daily, sleep_rhr, on='Datum', how='inner')\n", "df_combined.head()" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 5 }