"""Compare classifier F1 scores over every feature subset of the sport dataset.

For each non-empty subset of FEATURES, three scikit-learn classifiers are
trained and scored (macro F1). For every feature, the scores are then averaged
per subset size and drawn as one line plot per classifier.
"""

import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

CLASS = "Aktivitätskategorie"
FEATURES = [
    "Distanz km",
    "Ø Herzfrequenz",
    "Maximale Herzfrequenz",
    "Aerober TE",
    "Ø Geschwindigkeit km/h",
    "Maximale Geschwindigkeit km/h",
    "Kalorien",
    "Zeit",
]

# Input CSV and output directory, relative to the working directory.
_DATA_FILE = "data/Sport_Daten.csv"
_PLOT_DIR = Path("plots")


class Classifier:
    """Bundle a model with the F1 scores collected for it.

    Attributes:
        model: The estimator object that is fitted and used for prediction.
        f1s: ``f1s[i][k]`` holds the F1 scores of all runs that used feature
            ``FEATURES[i]`` in a subset of ``k + 1`` features.
        f1_means: ``f1_means[i]`` receives the mean of each ``f1s[i][k]``.
        f1_medians: Allocated like ``f1_means``; not filled by this file.
    """

    def __init__(self, model):
        feature_count = len(FEATURES)
        self.model = model
        self.f1s = [
            [[] for _ in range(feature_count)] for _ in range(feature_count)
        ]
        self.f1_means = [[] for _ in range(feature_count)]
        self.f1_medians = [[] for _ in range(feature_count)]


def load_dataframe():
    """Read the feature and class columns from the CSV file.

    Returns:
        The loaded ``DataFrame``, or ``None`` if the file could not be read
        or parsed. The reason is printed to stderr in that case.
    """
    # Build a new list so FEATURES itself is never mutated.
    columns = [*FEATURES, CLASS]
    try:
        # The CSV was exported from Excel, which uses ';' as separator.
        return pd.read_csv(_DATA_FILE, usecols=columns, sep=";")
    except (OSError, ValueError) as error:
        # OSError: missing/unreadable file. ValueError: missing columns,
        # parser errors, empty data or decoding problems.
        print(f"Could not load {_DATA_FILE}: {error}", file=sys.stderr)
        return None


def show_lineplot(name_of_classifier, data_frame):
    """Draw, save and show the mean-F1 line plot for one classifier.

    Args:
        name_of_classifier: Display name; used in the title and file name.
        data_frame: Frame with the columns ``"X"`` (number of features),
            ``"Y"`` (mean F1) and ``"Features"`` (line grouping).
    """
    # A dedicated figure keeps the lines of earlier plots out of this one.
    figure, axes = plt.subplots()
    sb.lineplot(data=data_frame, x="X", y="Y", hue="Features", ax=axes)
    axes.set_title(name_of_classifier + ":\n Performance mit Anzahl Features")
    axes.set_xlabel("Anzahl Features")
    axes.set_ylabel("F1-Mittelwert")

    # savefig does not create missing directories.
    _PLOT_DIR.mkdir(parents=True, exist_ok=True)
    file_name = "lineplot_" + "_".join(name_of_classifier.split()) + ".png"
    figure.savefig(_PLOT_DIR / file_name)

    plt.show()
    plt.close(figure)


def _feature_subsets():
    """Yield ``(number, indices)`` for every non-empty subset of FEATURES.

    ``number`` runs from 1 to ``2**len(FEATURES) - 1``; its zero-padded binary
    representation marks, position by position, which features are selected.
    """
    feature_count = len(FEATURES)
    for number in range(1, 2 ** feature_count):
        bits = bin(number)[2:].zfill(feature_count)
        yield number, [pos for pos, bit in enumerate(bits) if bit == "1"]


def _score_all_subsets(data_frame, classifiers):
    """Train and score every classifier on every feature subset.

    Each F1 score is appended to ``classifier.f1s`` once per feature that was
    part of the subset, in the bucket for that subset's size.
    """
    # The one-hot encoded target is the same for every subset: build it once.
    target = pd.get_dummies(data_frame[CLASS])
    total = 2 ** len(FEATURES) - 1

    for number, indices in _feature_subsets():
        features = [FEATURES[index] for index in indices]
        X_train, X_test, y_train, y_test = train_test_split(
            data_frame[features], target, test_size=0.2, random_state=0
        )

        for classifier in classifiers.values():
            classifier.model.fit(X_train.values, y_train.values)
            pred = classifier.model.predict(X_test.values)
            f1 = f1_score(y_test.values, pred, average="macro")
            for index in indices:
                classifier.f1s[index][len(features) - 1].append(f1)

        # Overwrite the progress value in place on the console.
        progress = number * 100 / total
        print(f"\r Progress: {progress:.2f}%", end="", flush=True)

    # Finish the progress line so later output starts on a fresh line.
    print()


def _mean_f1_rows(classifier):
    """Fill ``classifier.f1_means`` and return the rows for the line plot."""
    rows = []
    for feature_index, feature in enumerate(FEATURES):
        # Every bucket is non-empty because all subsets were evaluated.
        for scores in classifier.f1s[feature_index]:
            classifier.f1_means[feature_index].append(
                sum(scores) / len(scores)
            )
        for size, mean in enumerate(classifier.f1_means[feature_index], 1):
            rows.append({"Features": feature, "X": size, "Y": mean})
    return rows


def main():
    """Load the data, evaluate all feature subsets and plot the results."""
    data_frame = load_dataframe()
    if data_frame is None:
        return
    print("✅ Load Dataframe")

    # Replace the activity names in the class column by integer codes.
    labelencoder = LabelEncoder()
    data_frame[CLASS] = labelencoder.fit_transform(data_frame[CLASS])

    classifiers = {
        "Random Forest Classifier": Classifier(
            model=RandomForestClassifier(random_state=0)
        ),
        "Decision Tree Classifier": Classifier(
            model=DecisionTreeClassifier(random_state=0)
        ),
        "K-Nearest-Neighbors Classifier": Classifier(
            model=KNeighborsClassifier(n_neighbors=5)
        ),
    }

    _score_all_subsets(data_frame, classifiers)

    for name, classifier in classifiers.items():
        show_lineplot(name, pd.DataFrame(_mean_f1_rows(classifier)))


# Start script
if __name__ == "__main__":
    main()