# Read data

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Source CSV headers (German) -> snake_case column names used in the rest of
# the notebook. Only the columns listed here are loaded from each file.
activity_cols = {
    "Datum": "date",
    "Ø Pace": "activity_pace_average",
    "Kalorien": "activity_calories",
    "Zeit": "activity_duration",
    "Ø Herzfrequenz": "activity_heart_rate_average",
    "Ø Atemfrequenz": "activity_breathing_rate_average",
}
sleep_cols = {
    "Sleep Score 7 Tage": "date",
    "Score": "sleep_score",
    "Dauer": "sleep_duration",
    "Schlafenszeit": "sleep_bedtime",
    "Ruheherzfrequenz": "sleep_resting_heart_rate",
    "HFV-Status": "sleep_hrv_status",
    "Atmung": "sleep_breathing_rate",
    "Schlafbedürfnis": "sleep_duration_needed",
}

# Load each file restricted to the mapped headers, then apply the new names.
df_activities = pd.read_csv(
    "data/raw/all_activities.csv",
    usecols=list(activity_cols),
).rename(columns=activity_cols)

df_sleep = pd.read_csv(
    "data/raw/sleep.csv",
    usecols=list(sleep_cols),
).rename(columns=sleep_cols)

# Clean data/time features

## Activities

In [None]:
def pace_to_seconds(pace_str):
    """Convert a pace value into a number of seconds.

    Parameters
    ----------
    pace_str
        Either a colon-separated string (``"MM:SS"`` or ``"HH:MM:SS"``) or any
        value accepted by ``float()`` (numbers, numeric strings, NaN).

    Returns
    -------
    int or float
        * ``"MM:SS"``    -> ``minutes * 60 + seconds`` (int)
        * ``"HH:MM:SS"`` -> ``hours * 3600 + minutes * 60 + seconds`` (int)
        * other values   -> ``float(pace_str)`` (NaN stays NaN)
        * ``0`` when the value cannot be parsed (e.g. ``"--"``, ``None``, or a
          colon string with a different number of fields).

    Only ``ValueError`` and ``TypeError`` (the errors ``int()`` / ``float()``
    raise for bad input) are mapped to 0; any other exception indicates a
    programming error and is allowed to propagate.
    """
    try:
        if isinstance(pace_str, str) and ":" in pace_str:
            fields = [int(field) for field in pace_str.split(":")]
            if len(fields) == 2:
                minutes, seconds = fields
                return minutes * 60 + seconds
            if len(fields) == 3:  # HH:MM:SS
                hours, minutes, seconds = fields
                return hours * 3600 + minutes * 60 + seconds
        # Not a recognised colon format: fall back to a plain numeric parse.
        # A colon string with an unexpected field count fails here and yields 0.
        return float(pace_str)
    except (ValueError, TypeError):
        # NOTE(review): unparsable paces become 0 rather than NaN, so they take
        # part in later statistics as real zeros - confirm this is intended.
        return 0


# Split the activity timestamp into a calendar day ("date") and the offset
# from midnight at which the activity started ("activity_starting_time").
activity_timestamp = pd.to_datetime(df_activities["date"], errors="coerce")
activity_day = activity_timestamp.dt.normalize()
df_activities["activity_starting_time"] = activity_timestamp - activity_day
df_activities["date"] = activity_day

# Pace becomes seconds via pace_to_seconds; duration becomes a Timedelta
# (unparsable durations turn into NaT).
df_activities["activity_pace_average"] = df_activities["activity_pace_average"].apply(pace_to_seconds)
activity_length = pd.to_timedelta(df_activities["activity_duration"], errors="coerce")
df_activities["activity_duration"] = activity_length

# Integer seconds; a missing duration is stored as 0 here.
duration_in_seconds = activity_length.dt.total_seconds()
df_activities["activity_duration_seconds"] = duration_in_seconds.fillna(0).astype(int)

# End of the activity, also expressed as an offset from midnight of "date".
df_activities["activity_ending_time"] = (
    df_activities["activity_starting_time"] + activity_length
)


In [None]:
# Activity columns that must be numeric for the correlation analysis.
numeric_columns = [
    "activity_calories",
    "activity_heart_rate_average",
    "activity_pace_average",
    "activity_breathing_rate_average",
]

for col in numeric_columns:
    values = df_activities[col]
    # Detect text columns by kind instead of ``dtype == "object"``: text read
    # from CSV may use a dedicated string dtype, in which case the equality
    # check is False and the clean-up below would be silently skipped.
    if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
        # NOTE(review): commas are removed as if they were thousands
        # separators; if the export uses a decimal comma, "14,5" would become
        # 145 - confirm against the raw file.
        values = values.str.replace(",", "", regex=False)
        # NOTE(review): "--" looks like a missing-value placeholder; it is
        # turned into 0 (not NaN) and therefore enters the statistics - confirm.
        values = values.replace("--", "0")
    # Anything that still is not a number becomes NaN.
    df_activities[col] = pd.to_numeric(values, errors="coerce")

## Sleep

In [None]:
df_sleep["date"] = pd.to_datetime(df_sleep["date"], errors="coerce")

# Drop rows whose bedtime is the "--" placeholder. The explicit copy makes
# df_sleep an independent frame, so the column assignments below do not write
# into a slice of the original (SettingWithCopyWarning).
df_sleep = df_sleep[df_sleep["sleep_bedtime"] != "--"].copy()

# Parse the 12-hour clock string once (a value that does not match
# "%I:%M %p" raises) and express it as an offset from midnight.
bedtime_clock = pd.to_datetime(df_sleep["sleep_bedtime"].astype(str), format="%I:%M %p")
bedtime_offset = pd.to_timedelta(
    bedtime_clock.dt.hour * 3600 + bedtime_clock.dt.minute * 60,
    unit="s",
)

# Bedtimes before noon are treated as "after midnight": shift them by one day
# so they sort after evening bedtimes and after same-day activity end times.
before_noon = bedtime_offset.dt.components["hours"] < 12
bedtime_offset.loc[before_noon] = bedtime_offset.loc[before_noon] + pd.Timedelta(days=1)
df_sleep["sleep_bedtime"] = bedtime_offset

# Actual and needed sleep duration as Timedelta plus plain seconds
# (unparsable values become NaT / NaN).
df_sleep["sleep_duration"] = pd.to_timedelta(df_sleep["sleep_duration"], errors="coerce")
df_sleep["sleep_duration_seconds"] = df_sleep["sleep_duration"].dt.total_seconds()
df_sleep["sleep_duration_needed"] = pd.to_timedelta(df_sleep["sleep_duration_needed"], errors="coerce")
df_sleep["sleep_duration_needed_seconds"] = df_sleep["sleep_duration_needed"].dt.total_seconds()

# Positive delta = slept longer than needed.
df_sleep["sleep_duration_needed_delta"] = df_sleep["sleep_duration"] - df_sleep["sleep_duration_needed"]
df_sleep["sleep_duration_needed_delta_seconds"] = df_sleep["sleep_duration_needed_delta"].dt.total_seconds()

In [None]:
# Sleep columns that must be numeric for the correlation analysis.
numeric_columns = [
    "sleep_score",
    "sleep_resting_heart_rate",
    "sleep_hrv_status",
    "sleep_breathing_rate",
]

for col in numeric_columns:
    values = df_sleep[col]
    # Detect text columns by kind instead of ``dtype == "object"``: text read
    # from CSV may use a dedicated string dtype, in which case the equality
    # check is False and the clean-up below would be silently skipped.
    if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
        # NOTE(review): commas are removed as if they were thousands
        # separators; if the export uses a decimal comma, "14,5" would become
        # 145 - confirm against the raw file.
        values = values.str.replace(",", "", regex=False)
        # NOTE(review): "--" looks like a missing-value placeholder; it is
        # turned into 0 (not NaN) and therefore enters the statistics - confirm.
        values = values.replace("--", "0")
    # Anything that still is not a number becomes NaN.
    df_sleep[col] = pd.to_numeric(values, errors="coerce")

## Combined

In [None]:
# For every calendar day keep the complete row of the activity that ended
# last. ``groupby("date").last()`` is deliberately not used: it returns the
# last *non-null* value of each column independently, so a missing field in
# the latest activity would be filled from an earlier activity of that day.
df_latest_activity = (
    df_activities
    .dropna(subset=["date"])  # groupby ignored rows without a date; keep doing so
    .sort_values("activity_ending_time", na_position="first")  # unknown end times never win
    .drop_duplicates(subset="date", keep="last")
    .sort_values("date")
    .reset_index(drop=True)
)
# Keep "date" as the first column, as the grouped result had it.
df_latest_activity = df_latest_activity[
    ["date"] + [name for name in df_latest_activity.columns if name != "date"]
]

# Right join: every sleep record is kept; days without an activity get NaN in
# the activity columns.
# NOTE(review): this pairs an activity with the sleep record carrying the same
# "date" - confirm that the sleep date refers to the night following the
# activity and not to the morning of waking up.
df_combined = pd.merge(df_latest_activity, df_sleep, on="date", how="right")

# Time between the end of the day's last activity and going to bed
# (both are offsets from midnight of "date").
df_combined["bedtime_activity_ending_delta"] = (
    df_combined["sleep_bedtime"] - df_combined["activity_ending_time"]
)


## Save cleaned data

In [None]:
# Persist the merged frame without the index column. to_csv does not create
# missing folders, so "data/cleaned" has to exist before this cell runs.
df_combined.to_csv("data/cleaned/combined_activities_sleep.csv", index=False)

## Data overview

In [None]:
# Show the first 30 rows of the merged frame as a sanity check.
df_combined.head(30)

In [None]:
# List the dtype of every column to verify the conversions above took effect.
df_combined.dtypes

# Visualizations

## Correlation Matrix sleep after all activities

In [None]:
# Only days with a recorded activity (calories present) belong in this matrix.
filtered_all_activities = df_combined[df_combined["activity_calories"].notna()]
# Correlate the filtered rows; the matrix was previously computed from the
# unfiltered df_combined, which left the filter above without any effect.
correlation_matrix = filtered_all_activities.corr(numeric_only=True)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", center=0)
plt.title("Korrelationsmatrix - Alle Aktivitäten")
plt.show()

## Correlation Matrix sleep after activities < 4h before sleep

In [None]:

# Days with a recorded activity whose end lies less than four hours before
# bedtime (rows without a delta compare as False and are left out).
has_activity = df_combined["activity_calories"].notna()
ends_within_4h = df_combined["bedtime_activity_ending_delta"] < pd.Timedelta(hours=4)
df_combine_activities_4_hours_before_sleep = df_combined[has_activity & ends_within_4h]

correlation_matrix = df_combine_activities_4_hours_before_sleep.corr(numeric_only=True)

# Annotated heatmap with a diverging palette centred on zero.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", center=0, ax=ax)
ax.set_title("Korrelationsmatrix- Trainings 4 Stunden vor dem Schlafen")
plt.show()

## Scatter plot

In [None]:
# Scatter of average activity heart rate against that night's sleep score,
# split by how long before bedtime the activity ended.

# Blue group: activity ended less than 4h before bedtime. The frame from the
# previous cell is already restricted to rows with recorded calories.
filtered_df_combined_4h_before_sleep = df_combine_activities_4_hours_before_sleep
blue_count = len(filtered_df_combined_4h_before_sleep)

# Red group: activity ended 4h or more before bedtime. ">=" matches the "≥4h"
# legend label and the pie chart below; with ">" a delta of exactly four hours
# would belong to neither group.
filtered_df_combined_more_than_4h_before_sleep = df_combined[
    (df_combined["activity_calories"].notna()) &
    (df_combined["bedtime_activity_ending_delta"] >= pd.Timedelta(hours=4))
]
red_count = len(filtered_df_combined_more_than_4h_before_sleep)

plt.figure(figsize=(8, 6))
plt.scatter(
    filtered_df_combined_4h_before_sleep['activity_heart_rate_average'],
    filtered_df_combined_4h_before_sleep['sleep_score'],
    alpha=0.7,
    color='blue',
    label=f'Training <4h vor Schlaf (n={blue_count})'
)
plt.scatter(
    filtered_df_combined_more_than_4h_before_sleep['activity_heart_rate_average'],
    filtered_df_combined_more_than_4h_before_sleep['sleep_score'],
    alpha=0.7,
    color='red',
    label=f'Training ≥4h vor Schlaf (n={red_count})'
)

plt.xlabel('Activity Heart Rate Average')
# The plotted y values are sleep_score, so the axis is labelled accordingly
# (it previously read "HRV Status That Night").
plt.ylabel('Sleep Score That Night')
# NOTE(review): points with a sleep score below 60 are outside the visible
# range - confirm this zoom is intended.
plt.ylim(60, 110)
plt.grid(True)
plt.legend(title='Gruppe', loc='best')
plt.show()

print(f'Number of blue points: {blue_count}')
print(f'Number of red points: {red_count}')

In [None]:
# Count sleep records per group: no activity that day, activity ending less
# than 4h before bedtime, activity ending 4h or more before bedtime.
trained = df_combined["activity_calories"].notna()
gap_to_bedtime = df_combined["bedtime_activity_ending_delta"]
four_hours = pd.Timedelta(hours=4)

counts = {
    "Kein Training": int((~trained).sum()),
    "Training weniger als 4h vor Schlaf": int((trained & (gap_to_bedtime < four_hours)).sum()),
    "Training mehr als 4h vor Schlaf": int((trained & (gap_to_bedtime >= four_hours)).sum()),
}

labels = list(counts)
sizes = [counts[label] for label in labels]

# Each wedge label shows the group name with its absolute count; the wedge
# itself shows the percentage.
wedge_labels = [f"{lab} ({cnt})" for lab, cnt in counts.items()]

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(sizes, labels=wedge_labels, autopct="%1.1f%%", startangle=90)
ax.set_title(
    "Verteilung: Kein Training / Training <4h vor Schlaf / Training ≥4h vor Schlaf")
ax.axis("equal")
plt.show()