"""Train and compare three penguin-species classifiers on culmen measurements.

The script loads ``penguins.csv``, prints a short overview, shows a few
diagnostic plots, trains a random forest, a decision tree and a k-nearest
neighbours model on one-hot encoded species labels, compares a hand-written
macro F1 score with scikit-learn's, and finally classifies one penguin whose
measurements are typed in on the command line.
"""

import numpy as np
import pandas as pd

# Columns read from the CSV: four numeric measurements plus the 'Species'
# target column (the name is historical; the list is not features only).
FEATURES = [
    'Flipper Length (mm)',
    'Body Mass (g)',
    'Culmen Depth (mm)',
    'Culmen Length (mm)',
    'Species',
]


def load_dataframe(path="penguins.csv"):
    """Read the columns listed in ``FEATURES`` from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``penguins.csv`` in the
            current working directory.

    Returns:
        The loaded DataFrame, or ``None`` (after printing a message) when the
        file does not exist.
    """
    try:
        return pd.read_csv(path, usecols=FEATURES)
    except FileNotFoundError:
        print(f"Datei '{path}' nicht gefunden.")
        return None


def calc_precision(tp, fp):
    """Return ``tp / (tp + fp)``, or 0.0 when nothing was predicted positive.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.
    """
    predicted_positive = tp + fp
    # Guard the 0/0 case: it would raise for Python ints and yield nan (with a
    # RuntimeWarning) for numpy integers.
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive


def calc_recall(tp, fn):
    """Return ``tp / (tp + fn)``, or 0.0 when there are no actual positives.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.
    """
    actual_positive = tp + fn
    # Same 0/0 guard as in calc_precision.
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive


def calc_f1_score(y_true, y_pred):
    """Compute the binary F1 score for one label column.

    Args:
        y_true: 1-D array-like of ground-truth flags (0/1 or bool).
        y_pred: 1-D array-like of predicted flags with the same length.
            Values are compared by position, not by index.

    Returns:
        The harmonic mean of precision and recall as a float, or 0.0 when
        either of them is 0.
    """
    truth = np.asarray(y_true).astype(bool)
    guess = np.asarray(y_pred).astype(bool)

    tp = int(np.sum(guess & truth))
    fp = int(np.sum(guess & ~truth))
    fn = int(np.sum(~guess & truth))

    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    if precision == 0 or recall == 0:
        return 0.0
    return (2 * precision * recall) / (precision + recall)


def calc_f1_macro(y_true, y_pred):
    """Compute the unweighted mean of the per-column F1 scores.

    Args:
        y_true: DataFrame with one indicator column per class.
        y_pred: Predictions with the same shape, either as a DataFrame whose
            column labels match ``y_true`` or as a 2-D array whose columns are
            in the same order as those of ``y_true``.

    Returns:
        The macro-averaged F1 score as a float; 0.0 if ``y_true`` has no
        columns.
    """
    if not isinstance(y_pred, pd.DataFrame):
        y_pred = pd.DataFrame(np.asarray(y_pred), columns=y_true.columns)

    # to_numpy() drops the row index so rows are paired by position; y_true
    # usually carries the shuffled index of a train/test split.
    f1_scores = [
        calc_f1_score(y_true[column].to_numpy(), y_pred[column].to_numpy())
        for column in y_true.columns
    ]
    if not f1_scores:
        return 0.0
    return float(np.mean(f1_scores))


def get_penguin_from_cli():
    """Prompt for culmen depth and length and return them as one sample.

    Returns:
        A ``(1, 2)`` float array ``[[culmen_depth, culmen_length]]``, or
        ``None`` (after printing a message) if an entry is not numeric.
    """
    try:
        culmen_depth = float(input("Culmen Depth (mm): "))
        culmen_length = float(input("Culmen Length (mm): "))
    except ValueError:
        print("Invalid input. Please enter numeric values.")
        return None
    return np.array([[culmen_depth, culmen_length]])


def main():
    """Run the complete load / inspect / train / evaluate / predict workflow."""
    # Plotting and scikit-learn are only needed by this workflow, so they are
    # imported here; the metric helpers above stay importable with just numpy
    # and pandas.
    import matplotlib.pyplot as plt
    import seaborn as sns
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.tree import DecisionTreeClassifier

    df = load_dataframe()
    if df is None:
        return

    print("\n=== Overview ===")
    print(df.describe())
    print(df.head())
    # info() writes its report itself and returns None, so it is not wrapped
    # in print(); it is called on the full frame so the non-null counts
    # describe every row rather than only the first five.
    df.info()

    print("\n=== Quality Assessment ===")
    row_count = len(df)
    print("Number of rows:", row_count)
    # Min, max, mean etc. are covered by df.describe() above.
    print("Check for null-values:", df.isnull().sum())

    print("\n=== Preprocessing ===")
    # Fill missing numeric values with the column mean.
    df = df.fillna(df.mean(numeric_only=True))
    # Encode the species names as integers.
    labelencoder = LabelEncoder()
    df['Species'] = labelencoder.fit_transform(df['Species'])

    print("\n=== Countplot ===")
    # Check how balanced the classes are.
    sns.countplot(x=df['Species'])
    plt.show()

    print("\n=== Heatmap ===")
    # Check the correlation between all columns.
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    plt.show()

    print("\n=== Feature Selection ===")
    features = ['Culmen Depth (mm)', 'Culmen Length (mm)']
    X = df[features]
    # One indicator column per encoded species.
    y = pd.get_dummies(df['Species'])

    print("\n=== Visualize Features ===")
    sns.scatterplot(
        x=df['Culmen Length (mm)'],
        y=df['Culmen Depth (mm)'],
        hue=df['Species'],
    )
    plt.show()

    print("\n=== Model Training ===")
    # Split the data 60/40 (train/test).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=0
    )
    models = {
        "Random Forest Classifier": RandomForestClassifier(
            n_estimators=700, random_state=0
        ),
        "Decision Tree Classifier": DecisionTreeClassifier(random_state=0),
        "K-Neighbors": KNeighborsClassifier(n_neighbors=5),
    }
    for model in models.values():
        model.fit(X_train.values, y_train.values)

    print("\n=== Model Evaluation ===")
    for name, model in models.items():
        pred = model.predict(X_test.values)
        my_f1_macro_score = calc_f1_macro(y_test, pred)
        print(f'My F1 score of {name} is {my_f1_macro_score}')
        f1_sklearn = f1_score(y_test.values, pred, average='macro')
        print(f'Sklearn F1 score of {name} is {f1_sklearn}')

    print("\n=== Prediction ===")
    # Example input: Culmen Depth (mm) = 18, Culmen Length (mm) = 50
    wild_penguin = get_penguin_from_cli()
    if wild_penguin is None:
        # The input error has already been reported; there is nothing to
        # classify.
        return

    # Column labels of the one-hot target are the encoded species numbers.
    class_labels = y.columns.to_numpy()
    for name, model in models.items():
        flags = np.asarray(model.predict(wild_penguin))[0].astype(bool)
        # The models are fitted on a 2-D indicator target, so a prediction
        # row may contain no positive flag at all; taking the arg-max of such
        # a row would silently report the first species.
        if not flags.any():
            print(f'{name}: Keine Spezies vorhergesagt')
            continue
        species_number = class_labels[flags.argmax()]
        species = labelencoder.inverse_transform([species_number])[0]
        print(f'{name}: Dieser Pinguin gehört der Spezies "{species}" an')


if __name__ == "__main__":
    main()