# cds1011-ls2/main.py
# 2025-11-29 16:16:16 +01:00
#
# 147 lines
# 4.2 KiB
# Python

import sys
from arguments import Arguments
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
# Command-line handling: <csv path> [mode] [information] [graphs].
if not sys.argv[1:]:
    print("Usage: python3 main.py <path to csv> <mode vector [v] (default) or absolut [a] or cartesian [c]> <optional information [true]> <optional graphs [true]>")
    sys.exit(1)

args = Arguments(sys.argv[1])

# The usage text documents vector mode ("v") as the default, so fall back to
# it when the mode is omitted instead of failing with an IndexError.
# NOTE(review): assumes Arguments.set_mode accepts "v" as the usage text
# states -- confirm against the Arguments class.
args.set_mode(sys.argv[2] if len(sys.argv) > 2 else "v")

# Each optional flag gets its own fallback, so passing only the information
# flag no longer resets it to False just because the graph flag is missing.
args.set_information(sys.argv[3] if len(sys.argv) > 3 else False)
args.set_graph(sys.argv[4] if len(sys.argv) > 4 else False)
FEATURES = ["points", "x", "y"]


def load_dataframe(file_path):
    """Load the feature columns from a CSV file and drop incomplete rows.

    Args:
        file_path: Path of the CSV file to read. Only the columns listed in
            ``FEATURES`` are loaded.

    Returns:
        A DataFrame holding the ``FEATURES`` columns, without any row that
        contains a missing value.

    Raises:
        SystemExit: With exit status 1 when the file does not exist; the
            error is printed to stderr first.
    """
    try:
        return pd.read_csv(file_path, usecols=FEATURES).dropna()
    except FileNotFoundError as error:
        # sys.exit(1) rather than quit(): quit() is an interactive helper
        # provided by the site module and would report success (status 0).
        print(error, file=sys.stderr)
        sys.exit(1)
def calc_f1_macro(y_true, y_pred):
    """Return the macro-averaged F1 score over the columns of ``y_true``.

    Every column label of ``y_true`` is looked up in ``y_pred`` as well, one
    F1 score is computed per column with ``calc_f1_score``, and the
    unweighted mean of those scores is returned.
    """
    per_column_scores = [
        calc_f1_score(y_true[label].values, y_pred[label])
        for label in y_true
    ]
    return np.mean(per_column_scores)
def calc_f1_score(y_true, y_pred):
    """Return the binary F1 score of ``y_pred`` measured against ``y_true``.

    Args:
        y_true: Sequence of ground-truth flags (truthy means positive).
        y_pred: Sequence of predicted flags with the same length.

    Returns:
        The F1 score as a float between 0.0 and 1.0. It is 0.0 whenever
        there is no true positive, because precision or recall is then zero
        or undefined.
    """
    actual = np.asarray(y_true).astype(bool)
    predicted = np.asarray(y_pred).astype(bool)

    tp = int(np.sum(predicted & actual))
    fp = int(np.sum(predicted & ~actual))
    fn = int(np.sum(~predicted & actual))

    if tp == 0:
        return 0.0

    # tp > 0 keeps both denominators non-zero. fp == 0 or fn == 0 must not
    # zero the score: a perfect prediction has precision = recall = 1.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)
def calc_precision(tp, fp):
    """Return the precision ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The share of positive predictions that are correct, or 0.0 when
        nothing was predicted positive (``tp + fp == 0``) and precision is
        therefore undefined.
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive
def calc_recall(tp, fn):
    """Return the recall ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The share of actual positives that were found, or 0.0 when there
        are no actual positives (``tp + fn == 0``) and recall is therefore
        undefined.
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive
def get_score_from_cli():
    """Prompt for an x and a y coordinate and return them as one sample.

    Returns:
        A NumPy array of shape (1, 2) holding ``[[x, y]]``, or ``None``
        (after printing a hint) when an entered value is not numeric. The
        y prompt is skipped if x is already invalid.
    """
    try:
        coordinates = [float(input(prompt)) for prompt in ("x: ", "y: ")]
    except ValueError:
        print("Invalid input. Please enter numeric values.")
        return None
    return np.array(coordinates).reshape(1, -1)
def main():
    """Explore the data set, train three classifiers and predict one point.

    Steps:
        1. Load the CSV named on the command line and show summary plots.
        2. One-hot encode ``points`` and split the data 60/40 into training
           and test sets.
        3. Fit a random forest, a decision tree and a k-nearest-neighbours
           classifier.
        4. Print the hand-written and the scikit-learn macro F1 score of
           every model on the test set.
        5. Read an (x, y) position from stdin and print the points each
           model predicts for it.
    """
    df = load_dataframe(args.get_file_path())
    print(df.describe())

    sns.countplot(x=df["points"])
    plt.show()
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    plt.show()
    sns.scatterplot(x=df['x'], y=df['y'], hue=df['points'])
    plt.show()

    features = ["x", "y"]
    X = df[features]
    # One indicator column per distinct points value.
    y = pd.get_dummies(df['points'])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0
    )

    models = {
        "Random Forest Classifier": RandomForestClassifier(n_estimators=700, random_state=0),
        "Decision Tree Classifier": DecisionTreeClassifier(random_state=0),
        "K-Neighbors": KNeighborsClassifier(n_neighbors=5),
    }

    for model in models.values():
        model.fit(X_train.values, y_train.values)

    for name, model in models.items():
        pred = model.predict(X_test.values)
        # calc_f1_macro looks prediction columns up by y_test's labels; a
        # bare DataFrame would be labelled 0..n-1 and fail or misalign
        # unless the points values happen to be exactly 0..n-1.
        pred_frame = pd.DataFrame(pred, columns=y_test.columns)
        my_f1_macro_score = calc_f1_macro(y_test, pred_frame)
        print(f'My F1 score of {name} is {my_f1_macro_score}\n')
        f1_sklearn = f1_score(y_test.values, pred, average='macro')
        print(f'Sklearn F1 score of {name} is {f1_sklearn}\n')

    score = get_score_from_cli()
    if score is None:
        # get_score_from_cli has already reported the invalid input;
        # predicting on None would only crash inside scikit-learn.
        sys.exit(1)

    for name, model in models.items():
        pred = model.predict(score)
        # Position of the predicted class within the one-hot layout the
        # models were trained on; y.columns maps it back to the points value.
        # NOTE(review): for an all-zero prediction row idxmax yields the
        # first column, so "no class predicted" is reported as the first
        # points value.
        best_column = pd.DataFrame(pred).idxmax(axis=1).iloc[0]
        points = y.columns[best_column]
        print(f"{name}: {points} Punkte")


if __name__ == "__main__":
    main()