added classification/classification.py solution to classification_template.py
This commit is contained in:
parent a311e2b3a9
commit 93c6362445
classification/classification.py | 153 lines added (Normal file)
@@ -0,0 +1,153 @@
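# Solution for classification_template.py: exploratory analysis of the penguins
# dataset followed by species classification on the culmen measurements.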
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

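# Columns loaded from penguins.csv; the names are assumed to match the
# Palmer Penguins CSV used by the template.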
FEATURES = ['Flipper Length (mm)', 'Body Mass (g)', 'Culmen Depth (mm)', 'Culmen Length (mm)', 'Species']

def load_dataframe():
    try:
        column_list = FEATURES
        df = pd.read_csv("penguins.csv", usecols=column_list)
        return df
    except FileNotFoundError:
        print("File 'penguins.csv' not found.")
        return None

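# Confusion-matrix based helpers. Note: there is no guard against tp + fp == 0
# (or tp + fn == 0), i.e. an all-negative prediction column.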
def calc_precision(tp, fp):
    return tp / (tp + fp)

def calc_recall(tp, fn):
    return tp / (tp + fn)

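# Binary F1 for a single class: y_true and y_pred are the boolean one-vs-rest
# columns of the one-hot encoded targets.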
def calc_f1_score(y_true, y_pred):
    # https://stackoverflow.com/questions/64860091/computing-macro-average-f1-score-using-numpy-pythonwithout-using-scikit-learn
    tp = np.sum(np.multiply([i==True for i in y_pred], y_true))
    tn = np.sum(np.multiply([i==False for i in y_pred], [not(j) for j in y_true]))
    fp = np.sum(np.multiply([i==True for i in y_pred], [not(j) for j in y_true]))
    fn = np.sum(np.multiply([i==False for i in y_pred], y_true))
    precision = calc_precision(tp, fp)
    recall = calc_recall(tp, fn)

    if precision != 0 and recall != 0:
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0
    return f1


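# Macro F1: unweighted mean of the per-class F1 scores, iterating over the
# one-hot columns; y_true and y_pred must share the same column labels.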
def calc_f1_macro(y_true, y_pred):
    f1_scores = []
    for column in y_true:
        score = calc_f1_score(y_true[column].values, y_pred[column])
        f1_scores.append(score)
    return np.mean(f1_scores)

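# Reads one penguin from stdin; the input order (depth first, then length) must
# match the `features` list used for training in main().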
def get_penguin_from_cli():
    try:
        culmen_depth = float(input("Culmen Depth (mm): "))
        culmen_length = float(input("Culmen Length (mm): "))
        return np.array([culmen_depth, culmen_length]).reshape(1, -1)
    except ValueError:
        print("Invalid input. Please enter numeric values.")
        return None

def main():
    df = load_dataframe()
    if df is None:
        return

    print("\n=== Overview ===")
    print(df.describe())
    print(df.head())
    df.head().info()

    print("\n=== Quality Assessment ===")
    row_count = len(df)
    print("Number of rows:", row_count)

    # check min, max, mean ...
    # See df.describe() above

    print("Check for null-values:", df.isnull().sum())

print("\n=== Preprocessing ===")
|
||||||
|
# fill null-values with mean
|
||||||
|
df.fillna(df.mean(numeric_only=True), inplace=True)
|
||||||
|
|
||||||
|
# transform species column to numbers
|
||||||
|
labelencoder = LabelEncoder()
|
||||||
|
df['Species'] = labelencoder.fit_transform(df['Species'])
|
||||||
|
|
||||||
|
print("\n=== Countplot ===")
|
||||||
|
# Countplot check for the balancing of the data
|
||||||
|
sns.countplot(x = df['Species'])
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
print("\n=== Heatmap ===")
|
||||||
|
# Check correlation among other variables
|
||||||
|
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
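    # Only the two culmen measurements are used for the model (presumably the
    # template's intended feature set); the other loaded columns serve the EDA above.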
print("\n=== Feature Selection ===")
|
||||||
|
features = ['Culmen Depth (mm)', 'Culmen Length (mm)']
|
||||||
|
|
||||||
|
y = df['Species']
|
||||||
|
X = df[features]
|
||||||
|
y = pd.get_dummies(y)
|
||||||
|
|
||||||
|
print("\n=== Visualize Features ===")
|
||||||
|
sns.scatterplot(x=df['Culmen Length (mm)'], y=df['Culmen Depth (mm)'], hue=df['Species'])
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
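    # y was one-hot encoded above, so each classifier is fitted as a
    # multi-output model (one output column per species).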
print("\n=== Model Training ===")
|
||||||
|
# Split data into 60/40 (train/test)
|
||||||
|
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
|
||||||
|
# Create a RandomForestClassifier with n_estimators=700
|
||||||
|
random_forest = RandomForestClassifier(n_estimators=700, random_state=0)
|
||||||
|
|
||||||
|
# Create a DecisionTreeClassifier
|
||||||
|
decision_tree = DecisionTreeClassifier(random_state=0)
|
||||||
|
|
||||||
|
# Create a KNeighborsClassifier with n_neighbors=5
|
||||||
|
k_neighbors = KNeighborsClassifier(n_neighbors=5)
|
||||||
|
|
||||||
|
models = {
|
||||||
|
"Random Forest Classifier": random_forest,
|
||||||
|
"Decision Tree Classifier": decision_tree,
|
||||||
|
"K-Neighbors": k_neighbors
|
||||||
|
}
|
||||||
|
|
||||||
|
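    # Fit on .values (plain ndarrays) to keep fit/predict inputs consistent with
    # the raw ndarray returned by get_penguin_from_cli(), avoiding feature-name warnings.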
    for name, model in models.items():
        model.fit(X_train.values, y_train.values)

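    # sklearn's f1_score on the one-hot targets computes a per-column (per-class)
    # F1 and, with average='macro', should match calc_f1_macro above.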
print("\n=== Model Evaluation ===")
|
||||||
|
for name, model in models.items():
|
||||||
|
pred = model.predict(X_test.values)
|
||||||
|
|
||||||
|
# Hint: calc_f1_macro expects "pred" to be a DataFrame --> pd.DataFrame(pred)
|
||||||
|
my_f1_macro_score = calc_f1_macro(y_test, pd.DataFrame(pred))
|
||||||
|
print(f'My F1 score of {name} is {my_f1_macro_score}')
|
||||||
|
|
||||||
|
f1_sklearn = f1_score(y_test.values, pred, average='macro')
|
||||||
|
print(f'Sklearn F1 score of {name} is {f1_sklearn}')
|
||||||
|
|
||||||
|
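    # The one-hot prediction is decoded back to a species name: idxmax picks the
    # column with the largest value, inverse_transform maps it to the label.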
print("\n=== Prediction ===")
|
||||||
|
# Culmen Depth (mm) = 18, Culmen Length (mm) = 50
|
||||||
|
#wild_penguin = np.array([18, 50]).reshape(1, -1)
|
||||||
|
wild_penguin = get_penguin_from_cli()
|
||||||
|
|
||||||
|
for name, model in models.items():
|
||||||
|
pred = model.predict(wild_penguin)
|
||||||
|
species_number = pd.DataFrame(pred).idxmax(axis=1)
|
||||||
|
species = labelencoder.inverse_transform(species_number)[0]
|
||||||
|
print(f'{name}: Dieser Pinguin gehört der Spezies "{species}" an')
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    main()
@@ -12,19 +12,19 @@ from sklearn.preprocessing import LabelEncoder
 FEATURES = ['Flipper Length (mm)','Body Mass (g)','Culmen Depth (mm)','Culmen Length (mm)', 'Species']
 
 def load_dataframe():
     try:
         column_list = FEATURES
-        df = pd.read_csv("penguins.csv", usecols=column_list)
+        df = pd.read_csv("penguins.csv", usecols = column_list)
         return df
     except FileNotFoundError:
         print("File 'penguins.csv' not found.")
         return None
 
 def calc_precision(tp, fp):
-    print("🛠️ under construction")
+    return tp / (tp + fp)
 
 def calc_recall(tp, fn):
-    print("🛠️ under construction")
+    return tp / (tp + fn)
 
 def calc_f1_score(y_true, y_pred):
     #https://stackoverflow.com/questions/64860091/computing-macro-average-f1-score-using-numpy-pythonwithout-using-scikit-learn
@@ -32,7 +32,13 @@ def calc_f1_score(y_true, y_pred):
     tn = np.sum(np.multiply([i==False for i in y_pred], [not(j) for j in y_true]))
     fp = np.sum(np.multiply([i==True for i in y_pred], [not(j) for j in y_true]))
     fn = np.sum(np.multiply([i==False for i in y_pred], y_true))
-    print("🛠️ under construction")
+    precision = calc_precision(tp, fp)
+    recall = calc_recall(tp, fn)
+    if precision != 0 and recall != 0:
+        f1 = (2 * precision * recall) / (precision + recall)
+    else:
+        f1 = 0
+    return f1
 
 
 def calc_f1_macro(y_true, y_pred):
@@ -62,9 +68,9 @@ def main():
     print(df.head().info())
 
     print("\n=== Quality Assessment ===")
     row_count = len(df)
-    print("Number of rows ", row_count)
-    print("Check for null-values ", df.isnull().sum())
+    print("Number of rows:", row_count)
+    print("Check for null-values:", df.isnull().sum())
 
     print("\n=== Preprocessing ===")
     # fill null-values with mean
@@ -76,7 +82,7 @@ def main():
 
     print("\n=== Countplot ===")
     # Countplot check for the balancing of the data
-    sns.countplot(x=df["Species"])
+    sns.countplot(x = df["Species"])
     plt.show()
 
     print("\n=== Heatmap ===")
@@ -85,7 +91,8 @@ def main():
     plt.show()
 
     print("\n=== Feature Selection ===")
-    features = ['Culmen Depth (mm)','Culmen Length (mm)']
+    features = ['Culmen Depth (mm)', 'Culmen Length (mm)']
+
     y = df["Species"]
     X = df[features]
     y = pd.get_dummies(y)
@@ -95,16 +102,40 @@ def main():
     plt.show()
 
     print("\n=== Model Training ===")
-    print("🛠️ under construction")
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
+    random_forest = RandomForestClassifier(n_estimators=700, random_state=0)
+    decision_tree = DecisionTreeClassifier(random_state=0)
+    k_neighbors = KNeighborsClassifier(n_neighbors=5)
+
+    models = {
+        "Random Forest Classifier": random_forest,
+        "Decision Tree Classifier": decision_tree,
+        "K-Neighbors": k_neighbors
+    }
+
+    for name, model in models.items():
+        model.fit(X_train.values, y_train.values)
 
     print("\n=== Model Evaluation ===")
-    print("🛠️ under construction")
+    for name, model in models.items():
+        pred = model.predict(X_test.values)
+
+        my_f1_macro_score = calc_f1_macro(y_test, pd.DataFrame(pred))
+        print(f"My F1 score of {name} is {my_f1_macro_score}")
+
+        f1_sklearn = f1_score(y_test.values, pred, average="macro")
+        print(f"Sklearn F1 score of {name} is {f1_sklearn}")
 
     print("\n=== Prediction ===")
     # Culmen Depth (mm) = 18, Culmen Length (mm) = 50
     #wild_penguin = np.array([18, 50]).reshape(1, -1)
-    #wild_penguin = get_penguin_from_cli()
-    print("🛠️ under construction")
+    wild_penguin = get_penguin_from_cli()
+
+    for name, model in models.items():
+        pred = model.predict(wild_penguin)
+        species_number = pd.DataFrame(pred).idxmax(axis=1)
+        species = label_encoder.inverse_transform(species_number)[0]
+        print(f"{name}: This penguin belongs to the species '{species}'")
 
 if __name__ == "__main__":
     main()