139 lines
4.0 KiB
Python
139 lines
4.0 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
from sklearn import datasets, linear_model
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.model_selection import GridSearchCV
|
|
from sklearn.ensemble import GradientBoostingRegressor
|
|
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
|
|
|
|
# Name of the CSV column used as the single input feature (X).
FEATURE_NAME = "Flipper Length (mm)"
# Name of the CSV column used as the regression target (Y).
TARGET_NAME = "Body Mass (g)"
|
|
|
|
def load_dataframe(path="penguins.csv"):
    """Load the feature and target columns of the penguins data set.

    Args:
        path: Location of the CSV file. Defaults to ``"penguins.csv"``
            relative to the current working directory.

    Returns:
        A DataFrame restricted to the ``FEATURE_NAME`` and ``TARGET_NAME``
        columns, or ``None`` when the file does not exist (a message is
        printed in that case).
    """
    column_list = [FEATURE_NAME, TARGET_NAME]
    # Only the read itself can hit a missing file, so only it is guarded.
    try:
        return pd.read_csv(path, usecols=column_list)
    except FileNotFoundError:
        print(f"Datei '{path}' nicht gefunden.")
        return None
|
|
|
|
def plot(X, Y):
    """Display a scatter plot of the samples with labelled axes.

    Args:
        X: Feature values, drawn on the horizontal axis.
        Y: Target values, drawn on the vertical axis.
    """
    # Axis captions come from the module-level column names.
    plt.xlabel(FEATURE_NAME)
    plt.ylabel(TARGET_NAME)
    plt.scatter(X, Y, label="data points")
    plt.legend()
    plt.show()
|
|
|
|
def plot_linear_regression(X, Y, x, y):
    """Display the samples together with a fitted regression line.

    Args:
        X: Feature values of the samples.
        Y: Target values of the samples.
        x: Horizontal coordinates of the regression line.
        y: Vertical coordinates of the regression line.
    """
    # The red line is drawn before the points so that draw order and legend
    # order stay exactly as they were.
    plt.plot(x, y, "r", label="lineare regression")
    plt.scatter(X, Y, label="data points")
    for set_caption, caption in ((plt.xlabel, FEATURE_NAME),
                                 (plt.ylabel, TARGET_NAME)):
        set_caption(caption)
    plt.legend()
    plt.show()
|
|
|
|
def calc_linear_regression_by_hand(X, Y):
    """Fit ``y = m * x + c`` by ordinary least squares without scikit-learn.

    The slope is ``sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)``
    and the intercept is ``mean_y - m * mean_x``.

    Args:
        X: Feature values. Any array-like is accepted; it is flattened, so
            both 1-D sequences and ``(n, 1)`` column arrays work.
        Y: Target values, with the same number of samples as ``X``.

    Returns:
        A tuple ``(m, c)`` of Python floats: slope and intercept.

    Raises:
        ValueError: If the inputs are empty, differ in length, or all
            ``X`` values are identical (the slope is undefined then).
    """
    # Flatten to plain 1-D float arrays so positional arithmetic is used
    # regardless of the caller's container (list, Series, column vector).
    x = np.asarray(X, dtype=float).ravel()
    y = np.asarray(Y, dtype=float).ravel()
    if x.size == 0:
        raise ValueError("X and Y must not be empty")
    if x.size != y.size:
        raise ValueError("X and Y must contain the same number of samples")

    mean_x = float(x.mean())
    mean_y = float(y.mean())
    x_centered = x - mean_x
    y_centered = y - mean_y

    denom = float(np.sum(x_centered ** 2))
    if denom == 0.0:
        raise ValueError("X has zero variance; the slope is undefined")
    numer = float(np.sum(x_centered * y_centered))

    m = numer / denom
    c = mean_y - m * mean_x
    return m, c
|
|
|
|
def calc_r2(X, Y, m, c):
    """Compute the coefficient of determination of the line ``y = m*x + c``.

    ``R2 = 1 - ss_r / ss_t`` where ``ss_r`` is the sum of squared residuals
    and ``ss_t`` is the total sum of squares around the mean of ``Y``.

    Args:
        X: Feature values (array-like, flattened before use).
        Y: Observed target values, same number of samples as ``X``.
        m: Slope of the line. A one-element array is accepted as well.
        c: Intercept of the line. A one-element array is accepted as well.

    Returns:
        The R2 score as a Python float. When ``Y`` is constant the ratio is
        undefined; 1.0 is returned for a perfect fit and 0.0 otherwise,
        mirroring the default behaviour of scikit-learn's ``r2_score``.

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    x = np.asarray(X, dtype=float).ravel()
    y = np.asarray(Y, dtype=float).ravel()
    if y.size == 0:
        raise ValueError("X and Y must not be empty")
    if x.size != y.size:
        raise ValueError("X and Y must contain the same number of samples")

    # Tolerate slope/intercept given as scalars or as one-element arrays.
    slope = float(np.asarray(m, dtype=float).ravel()[0])
    intercept = float(np.asarray(c, dtype=float).ravel()[0])

    # Sum of squared residuals between observations and predictions.
    residuals = y - (slope * x + intercept)
    ss_r = float(np.sum(residuals ** 2))

    # Total sum of squares around the mean of the observations.
    ss_t = float(np.sum((y - y.mean()) ** 2))

    if ss_t == 0.0:
        return 1.0 if ss_r == 0.0 else 0.0
    return 1.0 - ss_r / ss_t
|
|
|
|
def calc_linear_regression(X_train, y_train):
    """Fit a scikit-learn ``LinearRegression`` model on the training data.

    Args:
        X_train: Training features. scikit-learn expects a 2-D array-like
            of shape ``(n_samples, n_features)``.
        y_train: Training targets, one entry per row of ``X_train``.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model
|
|
|
|
def grid_search(X_train, y_train, params):
    """Run a 5-fold grid search over gradient-boosting hyper-parameters.

    Args:
        X_train: Training features.
        y_train: Training targets.
        params: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_options = {
        "cv": 5,
        "scoring": 'r2',
        "return_train_score": True,
    }
    # Fixed seed so repeated searches build the same boosted models.
    estimator = GradientBoostingRegressor(random_state=42)
    search = GridSearchCV(estimator, params, **search_options)
    search.fit(X_train, y_train)
    return search
|
|
|
|
|
|
def main():
    """Run the penguin regression walkthrough end to end.

    Loads the data set, prints summary statistics, drops incomplete rows,
    fits a least-squares line by hand and plots it. The remaining sections
    (R2 check, scikit-learn model, error metrics, grid search) are still
    placeholders that only print a notice and zeroed metrics.
    """
    df = load_dataframe()
    if df is None:
        return

    print("\n=== Überblick Penguins Dataset ===")
    print(df.describe())

    print("\n=== Data Quality Assessment ===")
    row_count = len(df)
    print("number of rows:", row_count)
    # Rows with a missing feature or target value are removed. No imputation
    # is needed afterwards: once incomplete rows are gone nothing is left
    # to fill.
    df = df.dropna()

    print("\n=== Lineare Regression von Hand ===")
    # 1-D arrays keep the hand-written formula working on scalars, so the
    # slope and intercept print as plain numbers instead of 1-element arrays.
    X = df[FEATURE_NAME].to_numpy()
    Y = df[TARGET_NAME].to_numpy()
    m, c = calc_linear_regression_by_hand(X, Y)
    print(f"m = {m}\nc = {c}")
    min_x = np.min(X)
    max_x = np.max(X)
    # Sample the fitted line across the observed feature range for plotting.
    x = np.linspace(min_x, max_x, 100)
    y = c + m * x
    plot_linear_regression(X, Y, x, y)

    print("\n=== Genauigkeit/Zuverlässigkeit der Formel prüfen ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # r2 = calc_r2(X, Y, m, c)

    print("\n=== Lineare Regression mit SciKit ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # model = calc_linear_regression(X_train, y_train)

    print("\n=== Genauigkeit/Zuverlässigkeit des Modells prüfen ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # Placeholder metrics until the scikit-learn model above is wired in.
    mae = 0
    mse = 0
    rmse = 0
    r2 = 0
    print('MAE (Mean Absolute Error):', mae)
    print('MSE (Mean Squared Error):', mse)
    print('RMSE (Root Mean Squared Error):', rmse)
    print("R2 Score:", r2)

    print("\n=== Grid Search ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # Empty search space for now; fill in candidate values before enabling
    # the grid search below.
    params = dict()
    params['n_estimators'] = []
    params['learning_rate'] = []
    params['subsample'] = []
    params['max_depth'] = []
    #grid = grid_search(X_train, y_train, params)
    #y_pred = grid.predict(X_test)
    #print("Best estimator across ALL searched params:\n",grid.best_estimator_)
    #print("Best score across ALL searched params:\n",grid.best_score_)
    #print("Best parameters across ALL searched params:\n",grid.best_params_)
|
|
|
|
# Run the walkthrough only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|