added classification/classification.py solution to classification_template.py

2025-11-25 21:48:49 +01:00 · 2025-11-25 21:48:49 +01:00 · 93c6362445
commit 93c6362445
parent a311e2b3a9
2 changed files with 198 additions and 14 deletions
--- a/classification/classification.py
+++ b/classification/classification.py
@ -0,0 +1,153 @@
 import pandas as pd
 import numpy as np
 import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.metrics import f1_score
 from sklearn.model_selection import train_test_split
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.preprocessing import LabelEncoder
 # Columns loaded from penguins.csv by load_dataframe(). Despite the name, the
 # list also contains 'Species', which main() uses as the prediction target.
 FEATURES = ['Flipper Length (mm)','Body Mass (g)','Culmen Depth (mm)','Culmen Length (mm)', 'Species']
 def load_dataframe():
    """Load the columns named in FEATURES from 'penguins.csv'.

    Returns:
        The resulting pandas DataFrame, or None when the file does not
        exist (a notice is printed in that case).
    """
    try:
        return pd.read_csv("penguins.csv", usecols=FEATURES)
    except FileNotFoundError:
        print("Datei 'penguins.csv' nicht gefunden.")
    return None
 def calc_precision(tp, fp):
    """Return the precision tp / (tp + fp).

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The fraction of positive predictions that were correct. When no
        positive prediction was made at all (tp + fp == 0) the ratio is
        undefined; 0.0 is returned instead of raising ZeroDivisionError
        (plain ints) or yielding NaN (NumPy integers).
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        return 0.0
    return tp / predicted_positive
 def calc_recall(tp, fn):
    """Return the recall tp / (tp + fn).

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The fraction of actual positives that were found. When there are no
        actual positives (tp + fn == 0) the ratio is undefined; 0.0 is
        returned instead of raising ZeroDivisionError (plain ints) or
        yielding NaN (NumPy integers).
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return 0.0
    return tp / actual_positive
 def calc_f1_score(y_true, y_pred):
    """Return the F1 score of one binary label column.

    Args:
        y_true: Sequence of ground-truth indicators; truthy entries count
            as positive.
        y_pred: Sequence of predicted indicators of the same length;
            entries equal to True (True or 1) count as positive.

    Returns:
        The harmonic mean of precision and recall, or 0.0 when there is no
        true positive. In that case precision and/or recall are either 0 or
        undefined, so the score is 0 by convention rather than NaN.
    """
    # Confusion-matrix counting adapted from
    # https://stackoverflow.com/questions/64860091/computing-macro-average-f1-score-using-numpy-pythonwithout-using-scikit-learn
    actual = np.asarray(y_true).astype(bool)
    predicted = np.asarray(y_pred) == True  # noqa: E712 - element-wise comparison
    tp = np.count_nonzero(predicted & actual)
    fp = np.count_nonzero(predicted & ~actual)
    fn = np.count_nonzero(~predicted & actual)
    # Without a true positive both ratios are 0 or undefined; returning early
    # also guarantees non-zero denominators for the helper calls below.
    if tp == 0:
        return 0.0
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    return (2 * precision * recall) / (precision + recall)
 def calc_f1_macro(y_true, y_pred):
    """Return the unweighted mean of the per-column F1 scores.

    Args:
        y_true: Column-indexable table of ground-truth indicators (main()
            passes a one-hot encoded DataFrame); iterating it yields the
            column labels.
        y_pred: Table of predictions that can be indexed with the same
            column labels as y_true.

    Returns:
        np.mean over calc_f1_score of every column (NaN if y_true has no
        columns).
    """
    per_class = [
        calc_f1_score(y_true[label].values, y_pred[label])
        for label in y_true
    ]
    return np.mean(per_class)
 def get_penguin_from_cli():
    """Ask the user for one penguin's culmen depth and culmen length.

    Returns:
        A NumPy array of shape (1, 2) holding [depth, length], or None when
        an entry cannot be parsed as a float (a notice is printed and the
        remaining prompts are skipped).
    """
    prompts = ("Culmen Depth (mm): ", "Culmen Length (mm): ")
    try:
        # Evaluated in order; the first unparsable entry aborts the rest.
        measurements = [float(input(prompt)) for prompt in prompts]
    except ValueError:
        print("Invalid input. Please enter numeric values.")
        return None
    return np.array(measurements).reshape(1, -1)
 def main():
    """Run the penguin species classification walkthrough.

    Steps: load the data, print an overview and quality checks, fill missing
    numeric values with the column mean, label-encode 'Species', show a
    count plot, a correlation heatmap and a feature scatter plot, train
    three classifiers on the two culmen features, compare the hand-written
    macro F1 score with scikit-learn's, and finally classify one penguin
    entered on the command line.

    Returns early (None) when the data file is missing or the user input
    is not numeric.
    """
    df = load_dataframe()
    if df is None:
        return

    print("\n=== Overview ===")
    print(df.describe())
    print(df.head())
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would only add a stray "None" line.
    df.head().info()

    print("\n=== Quality Assessment ===")
    row_count = len(df)
    print("Number of rows:", row_count)
    # min, max, mean ... are already covered by df.describe() above
    print("Check for null-values:", df.isnull().sum())

    print("\n=== Preprocessing ===")
    # fill null-values with the column mean
    df.fillna(df.mean(numeric_only=True), inplace=True)
    # transform species column to numbers
    label_encoder = LabelEncoder()
    df['Species'] = label_encoder.fit_transform(df['Species'])

    print("\n=== Countplot ===")
    # Countplot to check the balancing of the data
    sns.countplot(x=df['Species'])
    plt.show()

    print("\n=== Heatmap ===")
    # Check correlation among the variables
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
    plt.show()

    print("\n=== Feature Selection ===")
    features = ['Culmen Depth (mm)', 'Culmen Length (mm)']
    X = df[features]
    # One indicator column per encoded species
    y = pd.get_dummies(df['Species'])

    print("\n=== Visualize Features ===")
    sns.scatterplot(x=df['Culmen Length (mm)'], y=df['Culmen Depth (mm)'], hue=df['Species'])
    plt.show()

    print("\n=== Model Training ===")
    # Split data into 60/40 (train/test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    models = {
        "Random Forest Classifier": RandomForestClassifier(n_estimators=700, random_state=0),
        "Decision Tree Classifier": DecisionTreeClassifier(random_state=0),
        "K-Neighbors": KNeighborsClassifier(n_neighbors=5),
    }
    for model in models.values():
        model.fit(X_train.values, y_train.values)

    print("\n=== Model Evaluation ===")
    for name, model in models.items():
        pred = model.predict(X_test.values)
        # calc_f1_macro indexes the predictions by column, hence the DataFrame
        my_f1_macro_score = calc_f1_macro(y_test, pd.DataFrame(pred))
        print(f'My F1 score of {name} is {my_f1_macro_score}')
        f1_sklearn = f1_score(y_test.values, pred, average='macro')
        print(f'Sklearn F1 score of {name} is {f1_sklearn}')

    print("\n=== Prediction ===")
    # Example: Culmen Depth (mm) = 18, Culmen Length (mm) = 50
    wild_penguin = get_penguin_from_cli()
    if wild_penguin is None:
        # Invalid input was already reported; predict(None) would crash.
        return
    for name, model in models.items():
        pred = model.predict(wild_penguin)
        if not np.any(pred):
            # The models predict each indicator column separately, so a row
            # may contain no positive at all; idxmax would then silently
            # report the first species.
            print(f'{name}: Keine Spezies eindeutig vorhergesagt')
            continue
        species_number = pd.DataFrame(pred).idxmax(axis=1)
        species = label_encoder.inverse_transform(species_number)[0]
        print(f'{name}: Dieser Pinguin gehört der Spezies "{species}" an')
 # Run the walkthrough only when the file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/classification/classification_template.py
+++ b/classification/classification_template.py
@ -12,19 +12,19 @@ from sklearn.preprocessing import LabelEncoder
 FEATURES = ['Flipper Length (mm)','Body Mass (g)','Culmen Depth (mm)','Culmen Length (mm)', 'Species']
 def load_dataframe():
-    try:        
+    try:
        column_list = FEATURES
-        df = pd.read_csv("penguins.csv", usecols=column_list)
+        df = pd.read_csv("penguins.csv", usecols = column_list)
        return df
    except FileNotFoundError:
        print("Datei 'penguins.csv' nicht gefunden.")
        return None
 def calc_precision(tp, fp):
-    print("🛠️ under construction")
+    return tp / (tp + fp)
 def calc_recall(tp, fn):
-    print("🛠️ under construction")
+    return tp / (tp + fn)
 def calc_f1_score(y_true, y_pred):
    #https://stackoverflow.com/questions/64860091/computing-macro-average-f1-score-using-numpy-pythonwithout-using-scikit-learn
@ -32,7 +32,13 @@ def calc_f1_score(y_true, y_pred):
    tn = np.sum(np.multiply([i==False for i in y_pred], [not(j) for j in y_true]))
    fp = np.sum(np.multiply([i==True for i in y_pred], [not(j) for j in y_true]))
    fn = np.sum(np.multiply([i==False for i in y_pred], y_true))
-    print("🛠️ under construction")
+    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)
    if precision != 0 and recall != 0:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0
    return f1
 def calc_f1_macro(y_true, y_pred):
@ -62,9 +68,9 @@ def main():
    print(df.head().info())
    print("\n=== Quality Assessment ===")
-    row_count =  len(df)
+    row_count = len(df)
-    print("Number of rows ", row_count)
+    print("Number of rows:", row_count)
-    print("Check for null-values ", df.isnull().sum())
+    print("Check for null-values:", df.isnull().sum())
    print("\n=== Preprocessing ===")
    # fill null-values with mean
@ -76,7 +82,7 @@ def main():
    print("\n=== Countplot ===")
    # Countplot check for the balancing of the data
-    sns.countplot(x=df["Species"])
+    sns.countplot(x = df["Species"])
    plt.show()
    print("\n=== Heatmap ===")
@ -85,7 +91,8 @@ def main():
    plt.show()
    print("\n=== Feature Selection ===")
-    features = ['Culmen Depth (mm)','Culmen Length (mm)']
+    features = ['Culmen Depth (mm)', 'Culmen Length (mm)']
    y = df["Species"]
    X = df[features]
    y = pd.get_dummies(y)
@ -95,16 +102,40 @@ def main():
    plt.show()
    print("\n=== Model Training ===")
-    print("🛠️ under construction")
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    random_forest = RandomForestClassifier(n_estimators=700, random_state=0)
    decision_tree = DecisionTreeClassifier(random_state=0)
    k_neighbors = KNeighborsClassifier(n_neighbors=5)
    models = {
        "Random Forest Classifier": random_forest,
        "Decision Tree Classifier": decision_tree,
        "K-Neighbors": k_neighbors
    }
    for name, model in models.items():
        model.fit(X_train.values, y_train.values)
    print("\n=== Model Evaluation ===")        
-    print("🛠️ under construction")
+    for name, model in models.items():
        pred = model.predict(X_test.values)
        my_f1_macro_score = calc_f1_macro(y_test, pd.DataFrame(pred))
        print(f"My F1 score of {name} is {my_f1_macro_score}")
        f1_sklearn = f1_score(y_test.values, pred, average="macro")
        print(f"Sklearn F1 score of {name} is {f1_sklearn}")
    print("\n=== Prediction ===")
    # Culmen Depth (mm) = 18, Culmen Length (mm) = 50
    #wild_penguin = np.array([18, 50]).reshape(1, -1)
-    #wild_penguin = get_penguin_from_cli()
+    wild_penguin = get_penguin_from_cli()
-    print("🛠️ under construction")
+
    for name, model in models.items():
        pred = model.predict(wild_penguin)
        species_number = pd.DataFrame(pred).idxmax(axis=1)
        species = label_encoder.inverse_transform(species_number)[0]
        print(f"{name}: Dieser Pinguin gehört der Spezies '{species}' an")
 if __name__ == "__main__":
    main()