cds1011/lineare_regression/linear_regression.py

139 lines
4.0 KiB
Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# CSV column used as the single input feature (x-axis of the plots).
FEATURE_NAME = "Flipper Length (mm)"
# CSV column used as the regression target (y-axis of the plots).
TARGET_NAME = "Body Mass (g)"
def load_dataframe():
    """Read the feature and target columns of ``penguins.csv``.

    Returns:
        A ``pandas.DataFrame`` restricted to the ``FEATURE_NAME`` and
        ``TARGET_NAME`` columns, or ``None`` (after printing a message) when
        the CSV file does not exist.
    """
    wanted_columns = [FEATURE_NAME, TARGET_NAME]
    try:
        return pd.read_csv("penguins.csv", usecols=wanted_columns)
    except FileNotFoundError:
        print("Datei 'penguins.csv' nicht gefunden.")
    return None
def plot(X, Y):
    """Display a scatter plot of the samples.

    Args:
        X: Values drawn on the horizontal axis (labelled ``FEATURE_NAME``).
        Y: Values drawn on the vertical axis (labelled ``TARGET_NAME``).
    """
    # Axis labels do not depend on the plotted data, so set them up first.
    plt.xlabel(FEATURE_NAME)
    plt.ylabel(TARGET_NAME)

    plt.scatter(X, Y, label="data points")

    # The legend must be created after the labelled artist exists.
    plt.legend()
    plt.show()
def plot_linear_regression(X, Y, x, y):
    """Display the samples together with a fitted regression line.

    Args:
        X: Sample values for the horizontal axis (scatter points).
        Y: Sample values for the vertical axis (scatter points).
        x: Horizontal coordinates of the regression line.
        y: Vertical coordinates of the regression line.
    """
    # Axis labels do not depend on the plotted data, so set them up first.
    plt.xlabel(FEATURE_NAME)
    plt.ylabel(TARGET_NAME)

    # Red line for the fit, then the raw observations.
    plt.plot(x, y, "r", label="lineare regression")
    plt.scatter(X, Y, label="data points")

    # The legend must be created after the labelled artists exist.
    plt.legend()
    plt.show()
def calc_linear_regression_by_hand(X, Y):
    """Fit ``y = m * x + c`` by ordinary least squares using the closed form.

    The slope is ``sum((x - mean_x) * (y - mean_y)) / sum((x - mean_x) ** 2)``
    and the intercept is ``mean_y - m * mean_x``.

    Args:
        X: Feature values; any array-like that NumPy can convert to floats
            (list, ndarray, pandas Series, ...).
        Y: Target values with the same shape as ``X``.

    Returns:
        Tuple ``(m, c)`` of slope and intercept. For 1-D inputs both are
        NumPy scalars; for column vectors of shape ``(n, 1)`` both are arrays
        of shape ``(1,)``.

    Raises:
        ValueError: If the inputs are scalar or empty, differ in shape, or
            all ``X`` values are identical (the slope is then undefined).
    """
    # Converting up front makes the computation positional, so a pandas
    # Series with a non-contiguous index (e.g. after dropna) works too.
    x = np.asarray(X, dtype=float)
    y = np.asarray(Y, dtype=float)

    if x.shape != y.shape:
        raise ValueError(
            f"X and Y must have the same shape, got {x.shape} and {y.shape}"
        )
    if x.ndim == 0 or x.size == 0:
        raise ValueError("X and Y must contain at least one sample")

    mean_x = np.mean(x)
    mean_y = np.mean(y)
    x_dev = x - mean_x
    y_dev = y - mean_y

    # Reduce over the sample axis only, so (n, 1) inputs keep a (1,) result.
    numer = np.sum(x_dev * y_dev, axis=0)
    denom = np.sum(x_dev ** 2, axis=0)
    if np.any(denom == 0):
        raise ValueError("X has zero variance; the slope is undefined")

    m = numer / denom
    c = mean_y - (m * mean_x)
    return m, c
def calc_r2(X, Y, m, c):
    """Return the coefficient of determination of the line ``y = m * x + c``.

    Computes ``R^2 = 1 - SS_res / SS_tot`` where ``SS_res`` is the sum of
    squared residuals of the line and ``SS_tot`` is the sum of squared
    deviations of ``Y`` from its mean.

    Args:
        X: Feature values; any array-like that NumPy can convert to floats.
        Y: Observed target values with the same shape as ``X``.
        m: Slope of the fitted line.
        c: Intercept of the fitted line.

    Returns:
        The R^2 score as a Python ``float`` (1.0 for a perfect fit, 0.0 for a
        line no better than predicting the mean of ``Y``).

    Raises:
        ValueError: If the inputs are empty, differ in shape, or ``Y`` is
            constant (R^2 is then undefined).
    """
    x = np.asarray(X, dtype=float)
    y = np.asarray(Y, dtype=float)

    if x.shape != y.shape:
        raise ValueError(
            f"X and Y must have the same shape, got {x.shape} and {y.shape}"
        )
    if x.size == 0:
        raise ValueError("X and Y must contain at least one sample")

    # Total sum of squares: spread of the observations around their mean.
    ss_t = np.sum((y - np.mean(y)) ** 2)
    if ss_t == 0:
        raise ValueError("Y has zero variance; R^2 is undefined")

    # Residual sum of squares: what the line fails to explain.
    predicted = m * x + c
    ss_r = np.sum((y - predicted) ** 2)

    return float(1 - ss_r / ss_t)
def calc_linear_regression(X_train, y_train):
    """Fit a scikit-learn ``LinearRegression`` model on the training data.

    Args:
        X_train: Training features; scikit-learn expects a 2-D array-like of
            shape ``(n_samples, n_features)``.
        y_train: Training targets with one entry (or row) per sample.

    Returns:
        The fitted ``LinearRegression`` estimator.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model
def grid_search(X_train, y_train, params):
    """Run a grid search over ``GradientBoostingRegressor`` hyperparameters.

    Args:
        X_train: Training features passed to ``GridSearchCV.fit``.
        y_train: Training targets passed to ``GridSearchCV.fit``.
        params: Parameter grid handed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object (``cv=5``, ``scoring='r2'``,
        training scores recorded).
    """
    search = GridSearchCV(
        GradientBoostingRegressor(random_state=42),
        params,
        scoring='r2',
        cv=5,
        return_train_score=True,
    )
    search.fit(X_train, y_train)
    return search
def main():
    """Run the penguin regression walkthrough.

    Loads the dataset, drops incomplete rows, fits a line by hand and plots
    it, then prints placeholder output for the steps that are not implemented
    yet (formula accuracy, scikit-learn model, model metrics, grid search).
    """
    df = load_dataframe()
    if df is None:
        # load_dataframe has already reported the missing file.
        return

    print("\n=== Überblick Penguins Dataset ===")
    print(df.describe())

    print("\n=== Data Quality Assessment ===")
    row_count = len(df)
    print("number of rows:", row_count)
    # Rows with a missing feature or target are removed. A mean-imputation
    # step after this would have nothing left to fill, so none is applied.
    df.dropna(inplace=True)
    if df.empty:
        # Without any complete row the regression below cannot be computed.
        print("Keine vollständigen Datensätze vorhanden.")
        return

    print("\n=== Lineare Regression von Hand ===")
    X = df[[FEATURE_NAME]].values
    Y = df[[TARGET_NAME]].values
    m, c = calc_linear_regression_by_hand(X, Y)
    print(f"m = {m}\nc = {c}")

    # Evaluate the fitted line on an even grid across the observed range.
    min_x = np.min(X)
    max_x = np.max(X)
    x = np.linspace(min_x, max_x, 100)
    y = c + m * x
    plot_linear_regression(X, Y, x, y)

    print("\n=== Genauigkeit/Zuverlässigkeit der Formel prüfen ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # TODO: r2 = calc_r2(X, Y, m, c)

    print("\n=== Lineare Regression mit SciKit ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # TODO: model = calc_linear_regression(X_train, y_train)

    print("\n=== Genauigkeit/Zuverlässigkeit des Modells prüfen ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # Placeholder metrics until the scikit-learn model is evaluated.
    mae = 0
    mse = 0
    rmse = 0
    r2 = 0
    print('MAE (Mean Absolute Error):', mae)
    print('MSE (Mean Squared Error):', mse)
    print('RMSE (Root Mean Squared Error):', rmse)
    print("R2 Score:", r2)

    print("\n=== Grid Search ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # Candidate values still have to be filled in before the search can run.
    params = dict()
    params['n_estimators'] = []
    params['learning_rate'] = []
    params['subsample'] = []
    params['max_depth'] = []
    # TODO: enable once a train/test split and the parameter values exist.
    #grid = grid_search(X_train, y_train, params)
    #y_pred = grid.predict(X_test)
    #print("Best estimator across ALL searched params:\n",grid.best_estimator_)
    #print("Best score across ALL searched params:\n",grid.best_score_)
    #print("Best parameters across ALL searched params:\n",grid.best_params_)
# Run the walkthrough only when executed as a script, not when imported.
if __name__ == "__main__":
    main()