import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

FEATURES = ['Flipper Length (mm)', 'Body Mass (g)', 'Culmen Depth (mm)',
            'Culmen Length (mm)', 'Species']


def load_dataframe():
    """Load the penguins dataset, restricted to the columns in FEATURES."""
    try:
        df = pd.read_csv("penguins.csv", usecols=FEATURES)
        return df
    except FileNotFoundError:
        print("File 'penguins.csv' not found.")
        return None


def calc_precision(tp, fp):
    """Precision = TP / (TP + FP); 0.0 if nothing was predicted positive."""
    if tp + fp == 0:
        return 0.0
    return tp / (tp + fp)


def calc_recall(tp, fn):
    """Recall = TP / (TP + FN); 0.0 if there are no actual positives."""
    if tp + fn == 0:
        return 0.0
    return tp / (tp + fn)


def calc_f1_score(y_true, y_pred):
    """Binary F1 score computed from scratch with numpy.

    Based on https://stackoverflow.com/questions/64860091/computing-macro-average-f1-score-using-numpy-pythonwithout-using-scikit-learn
    """
    y_true = np.asarray(y_true, dtype=bool)
    y_pred = np.asarray(y_pred, dtype=bool)
    tp = np.sum(y_pred & y_true)
    tn = np.sum(~y_pred & ~y_true)  # not needed for F1, kept for completeness
    fp = np.sum(y_pred & ~y_true)
    fn = np.sum(~y_pred & y_true)
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    if precision + recall == 0:
        return 0.0
    # F1 is the harmonic mean of precision and recall.
    return 2 * precision * recall / (precision + recall)


def calc_f1_macro(y_true, y_pred):
    """Macro F1: the unweighted mean of the per-class (per-column) F1 scores."""
    f1_scores = []
    for column in y_true:
        score = calc_f1_score(y_true[column].values, y_pred[column])
        f1_scores.append(score)
    return np.mean(f1_scores)


def get_penguin_from_cli():
    """Read the two model features interactively from the command line."""
    try:
        culmen_depth = float(input("Culmen Depth (mm): "))
        culmen_length = float(input("Culmen Length (mm): "))
        return np.array([culmen_depth, culmen_length]).reshape(1, -1)
    except ValueError:
        print("Invalid input. Please enter numeric values.")
        return None


def main():
    df = load_dataframe()
    if df is None:
        return

    print("\n=== Overview ===")
    print(df.describe())
    print(df.head())
    df.info()  # info() prints directly and returns None, so no print() around it

    print("\n=== Quality Assessment ===")
    print("Number of rows:", len(df))
    print("Null values per column:")
    print(df.isnull().sum())

    print("\n=== Preprocessing ===")
    # Fill null values with the column mean
    df.fillna(df.mean(numeric_only=True), inplace=True)
    # Encode the species names as integer labels
    label_encoder = LabelEncoder()
    df["Species"] = label_encoder.fit_transform(df["Species"])

    print("\n=== Countplot ===")
    # Check how balanced the classes are
    sns.countplot(x=df["Species"])
    plt.show()

    print("\n=== Heatmap ===")
    # Check the correlation between the variables
    sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
    plt.show()

    print("\n=== Feature Selection ===")
    features = ['Culmen Depth (mm)', 'Culmen Length (mm)']
    X = df[features]
    # One-hot encode the target so each species gets its own column
    y = pd.get_dummies(df["Species"])

    print("\n=== Visualize Features ===")
    sns.scatterplot(x=df['Culmen Length (mm)'], y=df['Culmen Depth (mm)'],
                    hue=df['Species'])
    plt.show()

    print("\n=== Model Training ===")
    # One possible completion of this step: hold out a test set and fit the
    # imported RandomForestClassifier on the one-hot targets. The imported
    # KNeighborsClassifier or DecisionTreeClassifier could be swapped in here.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    print("\n=== Model Evaluation ===")
    # Wrap the predictions in a DataFrame so calc_f1_macro can index by column,
    # and cross-check the own implementation against scikit-learn's f1_score.
    y_pred = pd.DataFrame(model.predict(X_test), columns=y_test.columns)
    print("Macro F1 (own implementation):", calc_f1_macro(y_test, y_pred))
    print("Macro F1 (scikit-learn):", f1_score(y_test, y_pred, average="macro"))

    print("\n=== Prediction ===")
    # Culmen Depth (mm) = 18, Culmen Length (mm) = 50
    wild_penguin = np.array([18, 50]).reshape(1, -1)
    # Alternatively, read the values interactively:
    # wild_penguin = get_penguin_from_cli()
    prediction = model.predict(wild_penguin)
    # The one-hot column order matches the LabelEncoder classes, so the argmax
    # index can be mapped back to the original species name.
    species_index = int(np.argmax(prediction[0]))
    print("Predicted species:", label_encoder.inverse_transform([species_index])[0])


if __name__ == "__main__":
    main()