"""Linear-regression exercise on the penguins dataset.

Reads ``penguins.csv``, fits a straight line that predicts body mass from
flipper length by hand (ordinary least squares), plots the result and reports
the coefficient of determination. The scikit-learn and grid-search sections
are still placeholders.
"""
from typing import Optional, Tuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

FEATURE_NAME = "Flipper Length (mm)"
TARGET_NAME = "Body Mass (g)"


def load_dataframe() -> Optional[pd.DataFrame]:
    """Load the feature and target columns from ``penguins.csv``.

    Returns:
        A DataFrame containing only ``FEATURE_NAME`` and ``TARGET_NAME``, or
        ``None`` (after printing a notice) when the file does not exist.
    """
    column_list = [FEATURE_NAME, TARGET_NAME]
    # Keep the try body down to the single call that can raise.
    try:
        return pd.read_csv("penguins.csv", usecols=column_list)
    except FileNotFoundError:
        print("Datei 'penguins.csv' nicht gefunden.")
        return None


def plot(X, Y):
    """Show a scatter plot of ``Y`` against ``X`` with labelled axes."""
    plt.scatter(X, Y, label="data points")
    plt.xlabel(FEATURE_NAME)
    plt.ylabel(TARGET_NAME)
    plt.legend()
    plt.show()


def plot_linear_regression(X, Y, x, y):
    """Show the data points ``(X, Y)`` together with the fitted line ``(x, y)``.

    Args:
        X: Observed feature values (scatter).
        Y: Observed target values (scatter).
        x: Abscissae of the regression line.
        y: Ordinates of the regression line, drawn in red.
    """
    plt.plot(x, y, "r", label="lineare regression")
    plt.scatter(X, Y, label="data points")
    plt.xlabel(FEATURE_NAME)
    plt.ylabel(TARGET_NAME)
    plt.legend()
    plt.show()


def calc_linear_regression_by_hand(X, Y) -> Tuple[float, float]:
    """Fit ``y = m * x + c`` by ordinary least squares.

    Args:
        X: Feature values; a flat sequence or an ``(n, 1)`` column array.
        Y: Target values with the same number of elements as ``X``.

    Returns:
        ``(m, c)`` - slope and intercept as plain floats.

    Raises:
        ValueError: If the inputs are empty, differ in length, or ``X`` has
            zero variance (the slope is undefined in that case).
    """
    # Flatten so that column vectors such as df[[col]].values produce scalar
    # coefficients instead of 1-element arrays.
    x = np.asarray(X, dtype=float).ravel()
    y = np.asarray(Y, dtype=float).ravel()
    if x.size == 0 or x.size != y.size:
        raise ValueError("X and Y must be non-empty and of equal length")

    mean_x = x.mean()
    mean_y = y.mean()
    dx = x - mean_x

    denom = np.sum(dx ** 2)
    if denom == 0:
        raise ValueError("X has zero variance; slope is undefined")

    m = np.sum(dx * (y - mean_y)) / denom
    c = mean_y - m * mean_x
    return float(m), float(c)


def calc_r2(X, Y, m, c) -> float:
    """Return the coefficient of determination of the line ``y = m * x + c``.

    Computed as ``1 - SS_res / SS_tot`` where ``SS_res`` is the sum of squared
    residuals and ``SS_tot`` the total sum of squares around the mean of ``Y``.

    Args:
        X: Feature values; a flat sequence or an ``(n, 1)`` column array.
        Y: Observed target values with the same number of elements as ``X``.
        m: Slope of the fitted line.
        c: Intercept of the fitted line.

    Raises:
        ValueError: If the inputs are empty, differ in length, or ``Y`` is
            constant (R^2 is undefined in that case).
    """
    # Flatten both sides; mixing (n, 1) with (n,) would broadcast to (n, n).
    x = np.asarray(X, dtype=float).ravel()
    y = np.asarray(Y, dtype=float).ravel()
    if x.size == 0 or x.size != y.size:
        raise ValueError("X and Y must be non-empty and of equal length")

    # total sum of squares
    ss_t = np.sum((y - y.mean()) ** 2)
    if ss_t == 0:
        raise ValueError("Y has zero variance; R^2 is undefined")

    # sum of squared residuals
    ss_r = np.sum((y - (m * x + c)) ** 2)
    return float(1 - ss_r / ss_t)


def calc_linear_regression(X_train, y_train):
    """Placeholder for the scikit-learn fit; only prints a notice for now.

    TODO: fit a model on the training data and return it.
    """
    print("🛠️ under construction")


def grid_search(X_train, y_train, params):
    """Run a 5-fold, R^2-scored grid search over a gradient boosting regressor.

    Args:
        X_train: Training features.
        y_train: Training targets.
        params: Parameter grid passed to ``GridSearchCV``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    model = GradientBoostingRegressor(random_state=42)
    grid = GridSearchCV(model, params, cv=5, scoring='r2', return_train_score=True)
    grid.fit(X_train, y_train)
    return grid


def main():
    """Run the exercise end to end: load, clean, fit by hand, plot, report."""
    df = load_dataframe()
    if df is None:
        return

    print("\n=== Überblick Penguins Dataset ===")
    print(df.describe())

    print("\n=== Data Quality Assessment ===")
    row_count = len(df)
    print("number of rows:", row_count)
    # Incomplete rows are dropped outright. The fillna(df.mean()) that used to
    # follow had nothing left to fill, so it was removed.
    df.dropna(inplace=True)

    print("\n=== Lineare Regression von Hand ===")
    X = df[[FEATURE_NAME]].values
    Y = df[[TARGET_NAME]].values
    m, c = calc_linear_regression_by_hand(X, Y)
    print(f"m = {m}\nc = {c}")

    x = np.linspace(np.min(X), np.max(X), 100)
    y = c + m * x
    plot_linear_regression(X, Y, x, y)

    print("\n=== Genauigkeit/Zuverlässigkeit der Formel prüfen ===")
    r2_by_hand = calc_r2(X, Y, m, c)
    print("R2 Score:", r2_by_hand)

    print("\n=== Lineare Regression mit SciKit ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # model = calc_linear_regression(X_train, y_train)

    print("\n=== Genauigkeit/Zuverlässigkeit des Modells prüfen ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # Placeholder metrics until the scikit-learn model exists.
    mae = 0
    mse = 0
    rmse = 0
    r2 = 0
    print('MAE (Mean Absolute Error):', mae)
    print('MSE (Mean Squared Error):', mse)
    print('RMSE (Root Mean Squared Error):', rmse)
    print("R2 Score:", r2)

    print("\n=== Grid Search ===")
    print("🐧 Not yet implemented — the penguin is still coding...")
    # Empty grids: fill in candidate values before enabling the search below.
    params = {
        'n_estimators': [],
        'learning_rate': [],
        'subsample': [],
        'max_depth': [],
    }
    #grid = grid_search(X_train, y_train, params)
    #y_pred = grid.predict(X_test)
    #print("Best estimator across ALL searched params:\n",grid.best_estimator_)
    #print("Best score across ALL searched params:\n",grid.best_score_)
    #print("Best parameters across ALL searched params:\n",grid.best_params_)


if __name__ == "__main__":
    main()