# cds1011-ls2/main.py

import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from py.arguments import Arguments

# The CSV path is a mandatory first command-line argument; bail out early
# with a usage hint when it is missing.
if len(sys.argv) < 2:
    print("Usage: python3 main.py <path to csv>")
    raise SystemExit(1)

# Columns read from the CSV: the target column ("points") plus the raw
# coordinates ("x", "y").
FEATURES = ["points", "x", "y"]

def load_dataframe(file_path):
    """Read the FEATURES columns of a CSV file and drop incomplete rows.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame holding only the FEATURES columns, without any row that
        contains a null value.

    Raises:
        SystemExit: With exit status 1 when the file does not exist (the
            error is printed first).
    """
    try:
        frame = pd.read_csv(file_path, usecols=FEATURES)
    except FileNotFoundError as error:
        print(error)
        # sys.exit instead of quit(): quit() is only injected by the site
        # module and would report success (status 0) for a failed load.
        sys.exit(1)
    return frame.dropna()

def calc_f1_macro(y_true, y_pred):
    """Return the unweighted mean of the per-column F1 scores.

    Args:
        y_true: DataFrame of ground-truth indicator columns.
        y_pred: Object indexable by the same column labels as y_true,
            holding the predicted indicators.

    Returns:
        The mean of calc_f1_score over every column of y_true.
    """
    per_class = [
        calc_f1_score(y_true[label].values, y_pred[label])
        for label in y_true
    ]
    return np.mean(per_class)

def calc_f1_score(y_true, y_pred):
    """Compute the binary F1 score for one indicator column.

    Args:
        y_true: Sequence of ground-truth indicators (truthy = positive).
        y_pred: Sequence of predicted indicators, same length as y_true.

    Returns:
        The harmonic mean of precision and recall as a float. 0.0 is
        returned when precision or recall is zero, which includes the case
        where the class occurs in neither y_true nor y_pred.
    """
    actual = np.asarray(y_true).astype(bool)
    predicted = np.asarray(y_pred).astype(bool)

    # True negatives are not part of F1, so only three cells are counted.
    tp = int(np.count_nonzero(actual & predicted))
    fp = int(np.count_nonzero(~actual & predicted))
    fn = int(np.count_nonzero(actual & ~predicted))

    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)

    if precision == 0 or recall == 0:
        return 0.0
    return (2 * precision * recall) / (precision + recall)

def calc_precision(tp, fp):
    """Return tp / (tp + fp), or 0.0 when nothing was predicted positive.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
    """
    predicted_positives = tp + fp
    # Guard the undefined 0/0 case instead of producing NaN or raising.
    if predicted_positives == 0:
        return 0.0
    return tp / predicted_positives

def calc_recall(tp, fn):
    """Return tp / (tp + fn), or 0.0 when there are no actual positives.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.
    """
    actual_positives = tp + fn
    # Guard the undefined 0/0 case instead of producing NaN or raising.
    if actual_positives == 0:
        return 0.0
    return tp / actual_positives

def get_score_from_cli(mode):
    """Prompt for an (x, y) coordinate and return it as a one-row array.

    Args:
        mode: Feature mode. "v" collapses the coordinate to its vector
            length; any other value keeps x and y as two features.

    Returns:
        A numpy array of shape (1, 1) for mode "v" and (1, 2) otherwise,
        or None (after printing a hint) when an entry is not a valid float.
    """
    try:
        # Evaluated in order, so a bad x aborts before y is requested.
        coords = [float(input(label)) for label in ("x: ", "y: ")]
    except ValueError:
        print("Invalid input. Please enter numeric values.")
        return None

    x, y = coords
    row = [np.sqrt(x**2 + y**2)] if mode == "v" else [x, y]
    return np.array(row).reshape(1, -1)

# Feature modes understood by main(): vector length, absolute values, raw
# coordinates.
_MODES = ("v", "a", "c")

# Answers treated as "yes" when a boolean setting is entered interactively.
_TRUE_WORDS = frozenset({"true", "t", "yes", "y", "1"})

def _parse_bool(text):
    """Return True only for an affirmative answer such as "True" or "y"."""
    return text.strip().lower() in _TRUE_WORDS

def _select_features(df, mode):
    """Return the feature frame that corresponds to one of the _MODES."""
    if mode == "v":
        # Vector length of (x, y) as the single feature.
        return np.sqrt(df["x"]**2 + df["y"]**2).to_frame("radius")
    if mode == "a":
        # Absolute values of (x, y) as features.
        return df[["x", "y"]].abs()
    # Mode "c": unaltered values of (x, y) as features.
    return df[["x", "y"]]

def main():
    """Run the interactive train / evaluate / predict loop.

    Each iteration shows the current settings, optionally lets the user
    change them, loads the CSV, optionally prints statistics and graphs,
    trains three classifiers on a 60/40 split, prints the hand-written and
    the sklearn macro F1 score for each, and finally predicts the points
    for a coordinate entered on the command line. The loop only ends when
    the user answers "exit" (or when the CSV cannot be loaded).
    """
    args = Arguments(sys.argv[1], "v", False, False)
    # NOTE(review): these setters appear to repeat the constructor
    # arguments; kept because Arguments is defined elsewhere.
    args.set_mode("v")
    args.set_information(False)
    args.set_graph(False)

    while True:
        # Read the values from args on every pass so that changes made in
        # a previous iteration are actually displayed.
        print("Currently selected setting:")
        print(f"File: {args.get_file_path()}")
        print(f"Mode: {args.get_mode()}")
        print(f"Display information: {args.get_information()}")
        print(f"Display graphs: {args.get_graph()}")

        prompt = input("Change settings [y / exit]: ")

        if prompt == "y":
            args.set_file_path(input("Change file <path to file>: "))
            args.set_mode(input("Change mode [v, a, c]: ").strip())
            # bool("False") is True, so the answers are parsed explicitly.
            args.set_information(_parse_bool(input("Display information [True / False]: ")))
            args.set_graph(_parse_bool(input("Display graphs [True / False]: ")))
        elif prompt == "exit":
            sys.exit()

        mode = args.get_mode()
        if mode not in _MODES:
            # Without this guard the feature matrix would be undefined.
            print(f"Unknown mode {mode!r}; choose one of v, a, c.")
            continue

        df = load_dataframe(args.get_file_path())

        if args.get_information():
            print(df.describe())
            print(df.head())
            # info() prints by itself and returns None.
            df.info()

        if args.get_graph():
            sns.countplot(x=df["points"])
            plt.show()

            sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
            plt.show()

            sns.scatterplot(x=df['x'], y=df['y'], hue=df['points'])
            plt.show()

        X = _select_features(df, mode)
        # One indicator column per distinct points value.
        y = pd.get_dummies(df['points'])

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

        models = {
            "Random Forest Classifier": RandomForestClassifier(n_estimators=700, random_state=0),
            "Decision Tree Classifier": DecisionTreeClassifier(random_state=0),
            "K-Neighbors": KNeighborsClassifier(n_neighbors=5)
        }

        for model in models.values():
            model.fit(X_train.values, y_train.values)

        for name, model in models.items():
            pred = model.predict(X_test.values)

            # calc_f1_macro looks columns up by label, so the prediction
            # frame must carry the same labels as y_test.
            pred_frame = pd.DataFrame(pred, columns=y_test.columns)
            my_f1_macro_score = calc_f1_macro(y_test, pred_frame)
            print(f'My F1 score of {name} is {my_f1_macro_score}\n')

            f1_sklearn = f1_score(y_test.values, pred, average='macro')
            print(f'Sklearn F1 score of {name} is {f1_sklearn}\n')

        score = get_score_from_cli(mode)
        if score is None:
            # Invalid coordinate input: start over instead of predicting.
            continue
        if mode == "a":
            # The models were trained on absolute values in this mode.
            score = np.abs(score)

        for name, model in models.items():
            pred = model.predict(score)
            # Position of the first maximal indicator in the single row.
            # NOTE: an all-zero prediction resolves to the first class.
            class_position = pd.DataFrame(pred).idxmax(axis=1).iloc[0]
            points = y.columns[class_position]
            print(f"{name}: {points} Punkte")


# Start the interactive loop only when the file is executed as a script.
if __name__ == "__main__":
    main()