diff --git a/classification.py b/classification.py
deleted file mode 100644
index a173aff..0000000
--- a/classification.py
+++ /dev/null
@@ -1,139 +0,0 @@
-import pandas as pd
-import seaborn as sb
-import matplotlib.pyplot as plt
-from sklearn.preprocessing import LabelEncoder
-from sklearn.model_selection import train_test_split
-from sklearn.ensemble import RandomForestClassifier
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.metrics import f1_score
-
-CLASS = "Aktivitätskategorie"
-FEATURES = [
-    "Distanz km",
-    "Ø Herzfrequenz",
-    "Maximale Herzfrequenz",
-    "Aerober TE",
-    "Ø Geschwindigkeit km/h",
-    "Maximale Geschwindigkeit km/h",
-    "Kalorien",
-    "Zeit"
-]
-
-#Define a class for all the classifiers and set the instance variables for the result values
-class Classifier:
-    def __init__(self, model):
-        self.model = model
-        self.f1s = [[[] for j in range(len(FEATURES))] for i in range(len(FEATURES))]
-        self.f1_means = [[] for i in range(len(FEATURES))]
-        self.f1_medians = [[] for i in range(len(FEATURES))]
-
-#Function to load the dataframe
-def load_dataframe():
-    file = "data/Sport_Daten.csv"
-    try:
-        columns = FEATURES.copy() # Copy the values, don't reuse the same object as FEATURES
-        columns.append(CLASS)
-        data_frame = pd.read_csv(file, usecols=columns, sep=";") # sep is required because the CSV exported from Excel uses ;
-        return data_frame
-    except:
-        return None
-
-#Function to create a lineplot
-def show_lineplot(name_of_classifier, data_frame):
-    sb.lineplot(data=data_frame, x="X", y="Y", hue="Features")
-    plt.title(name_of_classifier + ":\n Performance mit Anzahl Features")
-    plt.xlabel("Anzahl Features")
-    plt.ylabel("F1-Mittelwert")
-    plt.savefig("plots/lineplot_" + "_".join(name_of_classifier.split()) + ".png")
-    plt.show()
-
-#Main function
-def main():
-    #Load the dataframe and check whether that was successful
-    data_frame = load_dataframe()
-    if data_frame is None:
-        return
-    else:
-        print("✅ Load Dataframe")
-
-
-    #Transform the activity column to numbers
-    labelencoder = LabelEncoder()
-    data_frame[CLASS] = labelencoder.fit_transform(data_frame[CLASS])
-
-    #Create Random Forest Classifier
-    random_forest = RandomForestClassifier(random_state=0)
-    #Create Decision Tree Classifier
-    decision_tree = DecisionTreeClassifier(random_state=0)
-    #Create K-Neighbors Classifier
-    k_neighbors = KNeighborsClassifier(n_neighbors=5)
-
-    #Define a dict with the classifiers and their names
-    classifiers = {
-        "Random Forest Classifier": Classifier(model=random_forest),
-        "Decision Tree Classifier": Classifier(model=decision_tree),
-        "K-Nearest-Neighbors Classifier": Classifier(model=k_neighbors),
-    }
-
-    #Iterate through all feature combinations; the binary value of the integer selects the features
-    for i in range(1, 2**len(FEATURES)): #from 1 to 2^len(FEATURES) - 1
-        binary_value = bin(i)[2:].zfill(len(FEATURES)) #Binary value padded with leading zeros
-        features = []
-        for j in range(len(FEATURES)): #Iterate through all positions in the binary code
-            if int(binary_value[j]) == 1: #A 1 means the feature should be used
-                features.append(FEATURES[j])
-
-        #Define dataframes
-        y = data_frame[CLASS]
-        X = data_frame[features]
-        y = pd.get_dummies(y)
-
-        #Split data into training and test data
-        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
-
-        #For all classifiers: train the model, predict values and calculate the f1-score
-        for index, (name, classifier) in enumerate(classifiers.items()):
-            classifier.model.fit(X_train.values, y_train.values)
-            pred = classifier.model.predict(X_test.values)
-            f1 = f1_score(y_test.values, pred, average='macro')
-
-            #For each feature that was used in training, write the f1-score into the matrix
-            for j in range(len(FEATURES)):
-                if int(binary_value[j]) == 1:
-                    classifier.f1s[j][len(features)-1].append(f1)
-
-        #Print progress on the console, overwriting the current value
-        progress = i * 100 / (2**len(FEATURES)-1)
-        print(f"\r Progress: {progress:.2f}%", end="", flush=True)
-
-
-    #Make calculations for all classifiers
-    for name, classifier in classifiers.items():
-        #Calculate the average of the F1 scores
-        for i in range(len(FEATURES)):
-            for j in range(len(FEATURES)):
-                f1s_for_nr_features = []
-                f1s_for_nr_features.extend(classifier.f1s[i][j])
-                mean = sum(f1s_for_nr_features) / len(f1s_for_nr_features)
-                classifier.f1_means[i].append(mean)
-
-        #Create the rows for the lineplot
-        rows = []
-        for index_feature, feature in enumerate(FEATURES):
-            for x in range(len(classifier.f1_means[index_feature])):
-                rows.append({
-                    "Features": feature,
-                    "X": x+1,
-                    "Y": classifier.f1_means[index_feature][x]
-                })
-
-        #Create a dataframe and hand it to the lineplot function
-        line_plot_dataframe = pd.DataFrame(rows)
-        show_lineplot(name, line_plot_dataframe)
-
-#Start script
-if __name__ == "__main__":
-    main()
-
-
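Side note on the subset enumeration in the deleted script: each integer from 1 to 2**len(FEATURES) - 1 is read as a bitmask over the feature list, so every non-empty feature subset is visited exactly once. A minimal standalone sketch of that idea (the names below are illustrative and not taken from classification.py):

    features = ["a", "b", "c"]
    for i in range(1, 2 ** len(features)):
        bits = bin(i)[2:].zfill(len(features))  # e.g. 5 -> "101"
        subset = [f for f, b in zip(features, bits) if b == "1"]
        print(subset)  # every non-empty combination appears exactly once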