cds_introduction_data_scien.../code/corelation.py

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

# Hand-maintained lookup from German month abbreviations to the English
# abbreviations understood by strptime's '%b' directive.
month_translation = {
    'Jan': 'Jan',
    'Feb': 'Feb',
    'Mär': 'Mar',  # March appears as either 'Mär' or 'Mrz'
    'Mrz': 'Mar',
    'Apr': 'Apr',
    'Mai': 'May',
    'Jun': 'Jun',
    'Jul': 'Jul',
    'Aug': 'Aug',
    'Sep': 'Sep',
    'Okt': 'Oct',
    'Nov': 'Nov',
    'Dez': 'Dec',
}

# Convert a duration string such as '6h 11min' into a numeric number of hours
def convert_sleep_duration(sleep_duration_str):
    """Return the duration written as '<H>h <M>min' as decimal hours.

    Either component may be absent ('7h', '45min'); a missing component
    counts as zero.

    Args:
        sleep_duration_str: Duration text, e.g. '6h 11min'.

    Returns:
        Hours plus minutes / 60, as a float.

    Raises:
        ValueError: If the hour or minute component is not an integer.
    """
    has_hours = 'h' in sleep_duration_str
    has_minutes = 'min' in sleep_duration_str

    # Everything in front of the first 'h' is the hour count.
    whole_hours = int(sleep_duration_str.partition('h')[0].strip()) if has_hours else 0

    # The minutes are whatever follows the last 'h' once the unit is removed
    # (the whole string when there is no 'h' at all).
    if has_minutes:
        tail = sleep_duration_str.rpartition('h')[2]
        extra_minutes = int(tail.replace('min', '').strip())
    else:
        extra_minutes = 0

    return whole_hours + (extra_minutes / 60)

# Month abbreviations (German and English, lower-cased) mapped to month numbers.
# Resolving the number directly keeps parsing independent of the active locale,
# which strptime's '%b' directive is not.
_MONTH_NUMBERS = {
    'jan': 1, 'feb': 2, 'mär': 3, 'mrz': 3, 'mar': 3, 'apr': 4,
    'mai': 5, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9,
    'okt': 10, 'oct': 10, 'nov': 11, 'dez': 12, 'dec': 12,
}


def _parse_date_part(part):
    """Split one side of a date range into ``(month, day, year)``.

    ``month`` is None when only a day is given ('10'), ``year`` is None when
    no trailing four-digit year is present.
    """
    tokens = part.split()
    year = None
    # A trailing four-digit token is a year; a lone token is always the day.
    if len(tokens) >= 2 and len(tokens[-1]) == 4 and tokens[-1].isdigit():
        year = int(tokens.pop())

    if len(tokens) == 2:
        month_key = tokens[0].rstrip('.').lower()
        if month_key not in _MONTH_NUMBERS:
            raise ValueError(f"Unknown month abbreviation: {tokens[0]!r}")
        return _MONTH_NUMBERS[month_key], int(tokens[1]), year
    if len(tokens) == 1:
        return None, int(tokens[0]), year
    raise ValueError(f"Cannot parse date: {part!r}")


# Convert a date or date range into an ISO calendar-week label
def convert_to_week_and_year(date_range_str):
    """Return the ISO week of the range's start date as ``'W<week>-<year>'``.

    Accepted inputs (German or English month abbreviations):
    ``'Mär 4'``, ``'Mär 4, 2024'``, ``'Mär 4 - 10, 2024'``,
    ``'Okt 28 - Nov 3, 2024'``, ``'Dez 30 - Jan 5, 2025'`` and
    ``'Dez 31, 2023 - Jan 6, 2024'``. When no year is written, the current
    year is assumed.

    The year in the label is the ISO week-numbering year, not the calendar
    year: Dec 30 2024 belongs to ISO week 1 of 2025 and is labelled
    'W1-2025'. Pairing the ISO week with the calendar year would give
    'W1-2024', which collides with the first week of January 2024.

    Args:
        date_range_str: A single date or a 'start - end' range.

    Returns:
        The label 'W<iso week>-<iso year>' of the start date.

    Raises:
        ValueError: If the text cannot be parsed or names an invalid date.
    """
    # Treat en/em dashes like hyphens and commas like whitespace.
    normalized = date_range_str.replace("–", "-").replace("—", "-").replace(",", " ")
    start_text, _, end_text = normalized.partition("-")

    start_month, start_day, start_year = _parse_date_part(start_text)
    if start_month is None:
        raise ValueError(f"Missing month in date: {date_range_str!r}")

    # The end of the range only contributes its year and month (for the
    # year-rollover check); a missing end month means "same month as start".
    end_month, end_year = start_month, None
    if end_text.strip():
        parsed_month, _, end_year = _parse_date_part(end_text)
        if parsed_month is not None:
            end_month = parsed_month

    if start_year is None:
        start_year = end_year if end_year is not None else datetime.now().year
        # A range such as 'Dez 30 - Jan 5, 2025' starts in the previous year:
        # the written (or assumed) year belongs to the end of the range.
        if start_month > end_month:
            start_year -= 1

    start_date = datetime(start_year, start_month, start_day)
    iso_year, iso_week, _ = start_date.isocalendar()

    return f"W{iso_week}-{iso_year}"

# File locations: input CSVs, intermediate CSVs and output figures all live
# under the project's data directory.
_data_root = '/home/gra/PycharmProjects/cds_introduction_data_science_assignment/data'
_sandbox_dir = f'{_data_root}/sandbox'
_final_dir = f'{_data_root}/final'

# Inputs
hr_data_path = f'{_data_root}/raw/hr_gramic.csv'
sleep_data_path = f'{_sandbox_dir}/sleep_gramic.csv'

# Intermediate results written by the pipeline
hr_clean_path = f'{_sandbox_dir}/hr_data_clean.csv'
sleep_clean_path = f'{_sandbox_dir}/sleep_data_clean.csv'
combined_data_path = f'{_sandbox_dir}/combined_data.csv'

# Output figures
graphic_corr_path = f'{_final_dir}/gramic_sleep_hr_correlation.png'
graphic_weekly_path = f'{_final_dir}/weekly_hr_sleep.png'

# Step 1: load the heart-rate CSV, strip the ' bpm' unit, and reduce each row
# to its week label plus the mean of the 'In Ruhe' and 'Hoch' columns
hr_data = pd.read_csv(hr_data_path, sep=';')
for bpm_column in ('In Ruhe', 'Hoch'):
    hr_data[bpm_column] = hr_data[bpm_column].str.replace(' bpm', '').astype(float)
hr_data['Woche'] = hr_data['Datum'].apply(convert_to_week_and_year)
hr_data['avg_hr'] = hr_data[['In Ruhe', 'Hoch']].mean(axis=1)

# Persist only the two columns needed for the merge
hr_data_clean = hr_data[['Woche', 'avg_hr']]
hr_data_clean.to_csv(hr_clean_path, index=False)

# Step 2: load the sleep CSV, label each row with its week and convert the
# 'Durchschnittliche Dauer' text into decimal hours
sleep_data = pd.read_csv(sleep_data_path, sep=';')
sleep_data['Woche'] = sleep_data['Datum'].apply(convert_to_week_and_year)
sleep_data['Durchschnittliche Dauer'] = (
    sleep_data['Durchschnittliche Dauer'].apply(convert_sleep_duration)
)

# Persist only the two columns needed for the merge
sleep_data_clean = sleep_data[['Woche', 'Durchschnittliche Dauer']]
sleep_data_clean.to_csv(sleep_clean_path, index=False)

# Step 3: join heart-rate and sleep rows on the week label; the inner join
# keeps only weeks that occur in both tables
combined_data = hr_data_clean.merge(sleep_data_clean, on='Woche', how='inner')
combined_data.to_csv(combined_data_path, index=False)

# Step 4: correlation between average heart rate and sleep duration
# (Series.corr with its default method)
correlation = combined_data['avg_hr'].corr(combined_data['Durchschnittliche Dauer'])
print(
    f"Die Korrelation zwischen der durchschnittlichen Herzfrequenz und der Schlafdauer ist: {correlation}"
)

# # Step 5 (disabled alternative): scatter plot with heart rate on the inverted x-axis and sleep duration on the y-axis; superseded by the active version below
# plt.figure(figsize=(10, 6))
# plt.scatter(combined_data['avg_hr'], combined_data['Durchschnittliche Dauer'], color='blue', label='Datenpunkte')
# plt.title('Zusammenhang zwischen Herzfrequenz (Durchschnitt) und Schlafdauer')
# plt.xlabel('Durchschnittliche Herzfrequenz (bpm)')
# plt.ylabel('Schlafdauer (Stunden)')
# plt.grid(True)
# m, b = np.polyfit(combined_data['avg_hr'], combined_data['Durchschnittliche Dauer'], 1)
# plt.plot(combined_data['avg_hr'], m * combined_data['avg_hr'] + b, color='red', label=f'Trendlinie (Kor = {correlation:.2f})')
# plt.gca().invert_xaxis()  # X-Achse invertieren
# plt.legend()
# plt.savefig(graphic_corr_path)
# plt.show()

# Step 5: scatter plot of sleep duration (x) against average heart rate (y),
# drawn with an inverted y-axis
sleep_hours = combined_data['Durchschnittliche Dauer']
mean_heart_rate = combined_data['avg_hr']

plt.figure(figsize=(10, 6))
plt.scatter(sleep_hours, mean_heart_rate, color='blue', label='Datenpunkte')
plt.title('Zusammenhang zwischen Schlafdauer und Herzfrequenz (Durchschnitt)')
plt.xlabel('Schlafdauer (Stunden)')
plt.ylabel('Durchschnittliche Herzfrequenz (bpm)')
plt.grid(True)

# Fit a straight line (degree-1 polynomial) and draw it as the trend line;
# the legend entry carries the correlation coefficient
m, b = np.polyfit(sleep_hours, mean_heart_rate, 1)
plt.plot(
    sleep_hours,
    m * sleep_hours + b,
    color='red',
    label=f'Trendlinie (Kor = {correlation:.2f})',
)

plt.gca().invert_yaxis()  # heart rate is on the y-axis, so that is the axis to flip
plt.legend()
plt.savefig(graphic_corr_path)
plt.show()


# Step 6: one pair of bars per calendar week (heart rate on the left y-axis,
# sleep duration on the right y-axis)
fig, ax1 = plt.subplots(figsize=(30, 8))  # extra-wide canvas: one bar pair per week

bar_width = 0.4

# Left axis: heart rate. With align='edge' a negative width places the bar to
# the LEFT of its tick, so it sits next to the sleep bar instead of being
# half covered by it.
ax1.bar(combined_data['Woche'], combined_data['avg_hr'], width=-bar_width,
        label='Durchschnittliche Herzfrequenz', align='edge', color='b')
ax1.set_xlabel('Kalenderwoche')
ax1.set_ylabel('Durchschnittliche Herzfrequenz (bpm)', color='b')
ax1.tick_params(axis='y', labelcolor='b')

# Right axis: sleep duration, drawn to the RIGHT of the same tick
ax2 = ax1.twinx()
ax2.bar(combined_data['Woche'], combined_data['Durchschnittliche Dauer'], width=bar_width,
        label='Schlafdauer', align='edge', color='g')
ax2.set_ylabel('Schlafdauer (Stunden)', color='g')
ax2.tick_params(axis='y', labelcolor='g')

ax1.set_title('Durchschnittliche Herzfrequenz und Schlafdauer pro Kalenderwoche')

# Show only every second week
ax1.set_xticks(ax1.get_xticks()[::2])

# Rotate and enlarge the week labels on ax1 explicitly. After twinx() the
# current axes is ax2, whose x-axis is hidden, so plt.xticks(rotation=...)
# would style labels that are never drawn.
ax1.tick_params(axis='x', labelrotation=90, labelsize=12)

fig.tight_layout()

fig.savefig(graphic_weekly_path)
plt.show()