added folder classification and file classification_mnist_demo.py added folder datasets and updated README.md to explain what python libraries need to be installed for running all the scripts

This commit is contained in:
git-sandro 2025-11-13 12:29:37 +01:00
parent fe9b12b374
commit cb5d0149f7
5 changed files with 128 additions and 0 deletions

View File

@ -0,0 +1,8 @@
This reposetory is used for storing my datasets and scripts for data science.
Install this Python libraries in your virtual environment. Use (uv) pip install ...
numpy
matplotlib
openpyxl
scikit-learn

View File

@ -0,0 +1,119 @@
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.datasets import fetch_openml
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score
# Datensatz herunterladen
print("✅ Datensatz herunterladen")
mnist = fetch_openml('mnist_784', version=1, as_frame=False, parser='auto')
mnist.keys()
X, y = mnist["data"], mnist["target"]
X.shape
# Ziffer aus dem Datensatz: 5
print("✅ Ziffer aus dem Datensatz: 5")
some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.show()
# Ziffer aus dem Datensatz: 0
print("✅ Ziffer aus dem Datensatz: 0")
some_other_digit = X[1]
some_other_digit_image = some_other_digit.reshape(28, 28)
plt.imshow(some_other_digit_image, cmap=mpl.cm.binary)
plt.show()
# Label
print("✅ Label")
print(y[0])
y = y.astype(np.uint8)
# Zahlen Matrix
print("✅ Zahlen Matrix")
i = 1
for number in some_digit:
#28 Spalten
if i < 28:
if number > 0:
print("\x1b[31m{:03d}".format(math.trunc(number.item())), end = '\x1b[0m ')
else:
print("{:03d}".format(math.trunc(number.item())), end = ' ')
else:
print("{:03d}".format(math.trunc(number.item())))
i = 0
i = i+1
# Train-Test-Split
print("✅ Train-Test-Split")
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Testdaten vorbereiten für die Klassifikation der Ziffer 5
print("✅ Testdaten vorbereiten für die Klassifikation der Ziffer 5")
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
print(y_train_5)
# Logistische Regression zur binären Klassifikation (Ziffer aus dem Datensatz: 5)
print("✅ Logistische Regression zur binären Klassifikation")
model_log = SGDClassifier(loss="log_loss", max_iter=1000, tol=1e-3, random_state=42)
model_log.fit(X_train, y_train_5)
model_log.predict([some_digit])
# Support Vector Machine zur binären Klassifikation (Ziffer aus dem Datensatz: 0)
print("✅ Support Vector Machine zur binären Klassifikation")
model_hinge = SGDClassifier(loss="hinge", max_iter=1000, tol=1e-3, random_state=42)
model_hinge.fit(X_train, y_train_5)
model_hinge.predict([some_other_digit])
# Evaluation
print("✅ Evaluation")
model = model_hinge
y_train_pred = cross_val_predict(model_hinge, X_train, y_train_5, cv=3)
y_test_pred = cross_val_predict(model_hinge, X_test, y_test_5, cv=3)
#precision_score(y_train_5, y_train_pred)
precision_score(y_test_5, y_test_pred)
#recall_score(y_train_5, y_train_pred)
recall_score(y_test_5, y_test_pred)
#f1_score(y_train_5, y_train_pred)
f1_score(y_test_5, y_test_pred)
# One-versus-One (OvO)
print("✅ One-versus-One (OvO)")
model_ovo = OneVsOneClassifier(SVC(gamma="auto", random_state=42))
model_ovo.fit(X_train[:100], y_train[:100])
model_ovo.predict([some_digit])
# One-versus-the-Rest (OvR)
print("✅ One-versus-the-Rest (OvR)")
model_ovr = OneVsRestClassifier(SVC(gamma="auto", random_state=42))
model_ovr.fit(X_train[:100], y_train[:100])
model_ovr.predict([some_digit])
# Multilabel Classification
print("✅ Multilabel Classification")
y_train_large = (y_train >= 7) # grosse ziffern (7,8,9)
y_train_odd = (y_train % 2 == 1) # ungerade = true, gerade = false
y_multilabel = np.c_[y_train_large, y_train_odd] # 1-D array als spalte in a 2-D array konvertieren
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_multilabel)
# Multiclass Multioutput Classification
print("✅ Multiclass Multioutput Classification")
model_svc = SVC(gamma="auto", random_state=42)
model_svc.fit(X_train[:1000], y_train[:1000]) # y_train, not y_train_5
model_svc.predict([some_digit])
model_svc.classes_

Binary file not shown.

Binary file not shown.

View File

@ -56,6 +56,7 @@ def main():
df["finish_seconds"] = df["finish"].apply(time_str_to_seconds)
# create boxplot
df.boxplot(column=["finish_seconds"])
plt.title("Verteilung der Zielzeiten in Sekunden")
plt.ylabel("Sekunden")