"""Penguin species classification from culmen measurements.

Loads ``penguins.csv``, prints a short data overview, shows a few
diagnostic plots, trains three scikit-learn classifiers on one-hot
encoded species labels, compares a hand-written macro F1 score against
scikit-learn's, and finally classifies a penguin entered on the
command line.
"""

import numpy as np
import pandas as pd

# Columns read from the CSV file; 'Species' is the prediction target.
FEATURES = [
    'Flipper Length (mm)',
    'Body Mass (g)',
    'Culmen Depth (mm)',
    'Culmen Length (mm)',
    'Species',
]


def load_dataframe():
    """Read the columns listed in FEATURES from ``penguins.csv``.

    Returns:
        The loaded DataFrame, or None (after printing a message) when
        the file does not exist in the working directory.
    """
    try:
        return pd.read_csv("penguins.csv", usecols=FEATURES)
    except FileNotFoundError:
        print("Datei 'penguins.csv' nicht gefunden.")
        return None


def calc_precision(tp, fp):
    """Return tp / (tp + fp), or 0.0 when nothing was predicted positive.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
    """
    predicted_positives = tp + fp
    if predicted_positives == 0:
        # Without this guard NumPy counts yield nan (0/0) and plain ints
        # raise ZeroDivisionError.
        return 0.0
    return tp / predicted_positives


def calc_recall(tp, fn):
    """Return tp / (tp + fn), or 0.0 when there are no actual positives.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.
    """
    actual_positives = tp + fn
    if actual_positives == 0:
        return 0.0
    return tp / actual_positives


def calc_f1_score(y_true, y_pred):
    """Compute the F1 score of one binary indicator vector.

    Args:
        y_true: Sequence of ground-truth indicators (bool or 0/1).
        y_pred: Sequence of predicted indicators (bool or 0/1); compared
            with ``y_true`` position by position.

    Returns:
        The harmonic mean of precision and recall as a float, or 0.0
        when precision or recall is zero.
    """
    actual = np.asarray(y_true).astype(bool)
    predicted = np.asarray(y_pred).astype(bool)

    tp = np.count_nonzero(predicted & actual)
    fp = np.count_nonzero(predicted & ~actual)
    fn = np.count_nonzero(~predicted & actual)

    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)

    if precision == 0 or recall == 0:
        return 0.0
    return float((2 * precision * recall) / (precision + recall))


def calc_f1_macro(y_true, y_pred):
    """Average the per-column F1 scores of two indicator DataFrames.

    Args:
        y_true: DataFrame with one indicator column per class.
        y_pred: DataFrame that has a column for every column label of
            ``y_true``. Rows are matched by position, not by index.

    Returns:
        The unweighted mean of the per-class F1 scores.
    """
    f1_scores = [
        calc_f1_score(y_true[column], y_pred[column]) for column in y_true
    ]
    return float(np.mean(f1_scores))


def get_penguin_from_cli():
    """Prompt for culmen depth and length on standard input.

    Returns:
        A NumPy array of shape (1, 2) holding ``[depth, length]``, or
        None (after printing a message) when an entry is not numeric.
    """
    try:
        culmen_depth = float(input("Culmen Depth (mm): "))
        culmen_length = float(input("Culmen Length (mm): "))
    except ValueError:
        print("Invalid input. Please enter numeric values.")
        return None
    return np.array([culmen_depth, culmen_length]).reshape(1, -1)


def main():
    """Run the complete workflow: inspect, preprocess, train, evaluate, predict."""
    # The plotting and scikit-learn imports are deferred to this entry
    # point so the metric helpers above stay importable with only numpy
    # and pandas installed.
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.tree import DecisionTreeClassifier

    df = load_dataframe()
    if df is None:
        return

    print("\n=== Overview ===")
    print(df.describe())
    print(df.head())
    # DataFrame.info() prints its report itself and returns None, so it
    # must not be wrapped in print().
    df.head().info()

    print("\n=== Quality Assessment ===")
    row_count = len(df)
    print("Number of rows:", row_count)
    # Min, max, mean, ... are covered by df.describe() above.
    print("Check for null-values:", df.isnull().sum())

    print("\n=== Preprocessing ===")
    # Fill missing numeric values with the column mean.
    df = df.fillna(df.mean(numeric_only=True))

    # Encode the species names as integers.
    label_encoder = LabelEncoder()
    df['Species'] = label_encoder.fit_transform(df['Species'])

    print("\n=== Countplot ===")
    # Shows how balanced the classes are.
    sns.countplot(x=df['Species'])
    plt.show()

    print("\n=== Heatmap ===")
    # Correlation between all (now numeric) columns.
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    plt.show()

    print("\n=== Feature Selection ===")
    features = ['Culmen Depth (mm)', 'Culmen Length (mm)']
    X = df[features]
    # One indicator column per encoded species.
    y = pd.get_dummies(df['Species'])

    print("\n=== Visualize Features ===")
    sns.scatterplot(
        x=df['Culmen Length (mm)'],
        y=df['Culmen Depth (mm)'],
        hue=df['Species'],
    )
    plt.show()

    print("\n=== Model Training ===")
    # 60/40 train/test split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0
    )

    models = {
        "Random Forest Classifier": RandomForestClassifier(
            n_estimators=700, random_state=0
        ),
        "Decision Tree Classifier": DecisionTreeClassifier(random_state=0),
        "K-Neighbors": KNeighborsClassifier(n_neighbors=5),
    }

    for model in models.values():
        model.fit(X_train.values, y_train.values)

    print("\n=== Model Evaluation ===")
    for name, model in models.items():
        pred = model.predict(X_test.values)

        # calc_f1_macro looks predictions up by column label, so the raw
        # prediction array is wrapped in a DataFrame.
        my_f1_macro_score = calc_f1_macro(y_test, pd.DataFrame(pred))
        print(f'My F1 score of {name} is {my_f1_macro_score}')

        f1_sklearn = f1_score(y_test.values, pred, average='macro')
        print(f'Sklearn F1 score of {name} is {f1_sklearn}')

    print("\n=== Prediction ===")
    # Example input: Culmen Depth (mm) = 18, Culmen Length (mm) = 50
    wild_penguin = get_penguin_from_cli()
    if wild_penguin is None:
        # The invalid input was already reported; there is nothing to predict.
        return

    for name, model in models.items():
        pred = model.predict(wild_penguin)
        # Column label of the largest indicator value. If no indicator is
        # set, idxmax returns the first column.
        species_number = pd.DataFrame(pred).idxmax(axis=1)
        species = label_encoder.inverse_transform(species_number)[0]
        print(f'{name}: Dieser Pinguin gehört der Spezies "{species}" an')


if __name__ == "__main__":
    main()