cds1011-ls2/py/model.py

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# With only a few features, many classes end up with tp == 0 and fp == 0, so
# the precision/recall divisions become 0/0. Tell NumPy to ignore the
# resulting divide-by-zero and invalid-operation floating-point warnings.
np.seterr(divide='ignore', invalid='ignore')

# CSV columns read by load_dataframe (passed to pd.read_csv as usecols).
FEATURES = ["points", "x", "y"]

def make_dataframe(transform):
    """Build a CSV loader whose result is post-processed by *transform*.

    Args:
        transform: Callable that receives the cleaned DataFrame and returns
            the object the loader should hand back.

    Returns:
        A function ``load_dataframe(file_path)`` that reads the columns listed
        in ``FEATURES`` from the CSV file, drops rows containing missing
        values and returns ``transform(df)``.
    """
    def load_dataframe(file_path):
        # Keep the try body minimal: only a missing CSV file is treated as a
        # fatal start-up error, errors raised by ``transform`` propagate.
        try:
            df = pd.read_csv(file_path, usecols=FEATURES)
        except FileNotFoundError as error:
            print(error)
            # ``quit()`` is injected by the ``site`` module for interactive
            # sessions and is not guaranteed to exist; raise SystemExit
            # directly and report failure with a non-zero exit status.
            raise SystemExit(1) from error
        return transform(df.dropna())
    return load_dataframe

def make_features(selector):
    """Return a feature-selection function that applies *selector*.

    Depending on the mode, the selector either keeps the ``[x, y]``
    coordinates as features or reduces them to the length of the vector
    ``(x, y)`` (the radius).

    The previous implementation defined an inner function that ignored
    *selector* and then called it immediately, so the factory merely handed
    *selector* back. The closure below makes the intent explicit; for any
    callable selector ``make_features(selector)(df)`` still equals
    ``selector(df)``.

    Args:
        selector: Callable taking a DataFrame and returning the feature
            columns (for example ``radius`` or ``xy``).

    Returns:
        A function ``select(df)`` that returns ``selector(df)``.
    """
    def select(df):
        return selector(df)
    return select

def radius(df):
    """Return a one-column DataFrame ``"radius"`` (feature used when mode = v).

    The radius is the Euclidean length of the vector ``(x, y)``, i.e.
    ``sqrt(x**2 + y**2)``, computed row by row.

    The input frame is left untouched: the result is built as a new frame
    instead of first writing a ``"radius"`` column into the caller's *df*.

    Args:
        df: DataFrame with numeric columns ``"x"`` and ``"y"``.

    Returns:
        A DataFrame with the single column ``"radius"`` and the index of *df*.
    """
    length = np.sqrt(df["x"] ** 2 + df["y"] ** 2)
    return length.to_frame(name="radius")

def xy(df):
    """Return the ``"x"`` and ``"y"`` columns of *df* (features for mode a or c)."""
    return df[["x", "y"]]

def apply_model(df, features, score, inf, graph):
    """Train three classifiers on *df*, print their F1 scores and classify one point.

    The target is the one-hot encoded ``"points"`` column. A random forest, a
    decision tree and a k-nearest-neighbours classifier are fitted on a 60/40
    train/test split and evaluated with both the module's own macro F1 and
    scikit-learn's ``f1_score``. Afterwards one point is requested through
    *score* and each model prints the points it predicts for it.

    Args:
        df: DataFrame with the columns ``"points"``, ``"x"`` and ``"y"``.
        features: Callable that receives *df* and returns the feature columns
            to train on (for example ``radius`` or ``xy``).
        score: Zero-argument callable returning the point to classify as an
            array of shape ``(1, n_features)``, or ``None`` when the entered
            values were invalid.
        inf: If true, print summary information about *df*.
        graph: If true, display count, correlation and scatter plots.
    """
    # print dataframe information
    if inf:
        print(df.describe())
        print(df.head())
        # DataFrame.info() prints its report itself and returns None, so it
        # must not be wrapped in print().
        df.head().info()

    # display graphs
    if graph:
        sns.countplot(x=df["points"])
        plt.show()

        sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
        plt.show()

        sns.scatterplot(x=df['x'], y=df['y'], hue=df['points'])
        plt.show()

    y = pd.get_dummies(df['points'])
    X = features(df)  # select which features to use: radius or xy

    # Split data into 60/40 (train/test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

    models = {
        "Random Forest Classifier": RandomForestClassifier(n_estimators=700, random_state=0),
        "Decision Tree Classifier": DecisionTreeClassifier(random_state=0),
        "K-Neighbors": KNeighborsClassifier(n_neighbors=5),
    }

    for model in models.values():
        model.fit(X_train.values, y_train.values)

    for name, model in models.items():
        pred = model.predict(X_test.values)

        # calculate f1 with own function; label the prediction columns like
        # y_test so the per-class lookup pairs matching classes instead of
        # relying on the class labels happening to be 0..k-1
        my_f1_macro_score = calc_f1_macro(y_test, pd.DataFrame(pred, columns=y_test.columns))
        print(f'My F1 score of {name} is {my_f1_macro_score}')

        # calculate f1 with sklearn function
        f1_sklearn = f1_score(y_test.values, pred, average='macro')
        print(f'Sklearn F1 score of {name} is {f1_sklearn}')

    sample = score()  # prompt for x, y coordinates, transformed based on mode

    # The score function returns None after reporting invalid input; skip the
    # prediction in that case instead of passing None to predict().
    if sample is not None:
        # Fit the encoder only to translate a one-hot column position back to
        # its label. The caller's "points" column is deliberately not
        # overwritten, so the same df can be passed in again.
        label_encoder = LabelEncoder()
        label_encoder.fit(df["points"])

        for name, model in models.items():
            pred = model.predict(sample)
            points_number = pd.DataFrame(pred).idxmax(axis=1)
            points = label_encoder.inverse_transform(points_number)[0]
            print(f"{name}: {points} Punkte")

    input("\nPress any key to continue...\n")

def calc_f1_macro(y_true, y_pred):
    """Return the macro F1 score: the unweighted mean of the per-class F1 scores.

    Args:
        y_true: DataFrame with one indicator column per class.
        y_pred: Predictions, indexable by the same column labels as *y_true*.

    Returns:
        The mean of ``calc_f1_score`` over all columns of *y_true*.
    """
    per_class = [
        calc_f1_score(y_true[label].values, y_pred[label])
        for label in y_true
    ]
    return np.mean(per_class)

def calc_f1_score(y_true, y_pred):
    """Return the F1 score of one class from binary indicator vectors.

    Args:
        y_true: Sequence of 0/1 or boolean ground-truth indicators.
        y_pred: Sequence of 0/1 or boolean predicted indicators, same length.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or ``0.0`` when
        precision or recall is zero or undefined.
    """
    truth = np.asarray(y_true, dtype=bool)
    predicted = np.asarray(y_pred, dtype=bool)

    # Confusion-matrix cells; true negatives are not needed for F1.
    tp = np.sum(predicted & truth)
    fp = np.sum(predicted & ~truth)
    fn = np.sum(~predicted & truth)

    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)

    # ``> 0`` is false for NaN as well as for 0, so a class without any true
    # or predicted positives (0/0) scores 0.0 instead of turning the macro
    # average into NaN.
    if not (precision > 0 and recall > 0):
        return 0.0
    return (2 * precision * recall) / (precision + recall)

def calc_precision(tp, fp):
    """Return the precision ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The precision, or ``0.0`` when nothing was predicted positive
        (``tp + fp == 0``), where the ratio would otherwise be 0/0.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive

def calc_recall(tp, fn):
    """Return the recall ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The recall, or ``0.0`` when there are no actual positives
        (``tp + fn == 0``), where the ratio would otherwise be 0/0.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive

def make_score_function(transform):
    """Build a prompt function that reads x and y and returns the transformed point.

    Args:
        transform: Callable ``transform(x, y)`` that converts the two entered
            floats into the feature value(s) for the current mode.

    Returns:
        A zero-argument function that asks for ``x`` and ``y`` on the command
        line and returns the transformed point as an array with a single row.
        If a ``ValueError`` is raised (for example by non-numeric input), it
        prints a message and returns ``None``.
    """
    def get_score_from_cli():
        try:
            # The generator is consumed in order, so "x: " is asked first and
            # "y: " is only asked once x has been parsed successfully.
            x, y = (float(input(prompt)) for prompt in ("x: ", "y: "))
            point = np.array([transform(x, y)])
            return point.reshape(1, -1)
        except ValueError:
            print("Invalid input. Please enter numeric values.")
            return None
    return get_score_from_cli