Created PropertiesPerScrape and a few edits

refactor-to-mono
mmaurostoffel 2024-10-24 20:04:22 +02:00
parent f7b62f4e4c
commit e8830c32e6
9 changed files with 177 additions and 7 deletions


@@ -36,6 +36,33 @@ def getDataFromDB(propId):
    return scrapeDates, calendarData

def getUniqueScrapeDates():
    # Collect the scrape date from every calendar extraction's JSON header
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions "
                "WHERE type='calendar'")
    uniqueScrapeDates = cur.fetchall()
    db.close()
    return uniqueScrapeDates

def getPropsPerScrape(scrapeDate):
    # All property_ids whose calendar was extracted on the given day
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    # Parameterised query instead of f-string interpolation (avoids SQL injection)
    cur.execute("SELECT property_id "
                "FROM extractions "
                "WHERE type='calendar' AND created_at > %s AND created_at < %s",
                (scrapeDate, str(end_date)))
    properties = cur.fetchall()
    db.close()
    return properties

def getuniquePropIdFromDB():
    '''
    Function to get unique propId from MySQL database
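
A minimal usage sketch of the two new helpers (hypothetical, not part of the commit; it assumes this module is importable as Data_Analysis and that reformatScrapeDates normalises the raw JSON dates to 'YYYY-MM-DD', as in the scripts below):

import Data_Analysis as DA

# Hypothetical driver: list how many properties each scrape day covered
dates = DA.reformatScrapeDates(DA.getUniqueScrapeDates())
for d in dict.fromkeys(dates):          # drop duplicate dates, keep order
    props = DA.getPropsPerScrape(d)     # rows of (property_id,) tuples
    print(d, len(props))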


@@ -44,7 +44,7 @@ accuracyOverview.insert(0, "Heidi Mean", heidiMean, True)
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
-accuracyOverview.to_csv('results/accuracyOverview.csv', index=False)
+accuracyOverview.to_csv('results/accuracyOverview.csv', index=True)
# delete DataFrames that are no longer needed
del merge, accuracy, propData


@@ -17,4 +17,4 @@ with open('results/allLostProperties', 'w') as f:
    write = csv.writer(f)
    write.writerow(lostProperties)
-#Output: 221 of 1552 properties are lost
+#Output: 221 of 1552 properties were lost at some point


@@ -0,0 +1,32 @@
import Data_Analysis as DA
import pandas as pd

# Read all scrape dates, reformat them and drop duplicates
uniqueScrapeDates = DA.getUniqueScrapeDates()
uniqueScrapeDates = DA.reformatScrapeDates(uniqueScrapeDates)
uniqueScrapeDates = list(dict.fromkeys(uniqueScrapeDates))
#print(uniqueScrapeDates)

# Build a list of property lists, one per scrape date
fullPropList = []
for date in uniqueScrapeDates:
    properties = DA.getPropsPerScrape(date)
    propList = [prop[0] for prop in properties]
    propList = list(dict.fromkeys(propList))  # drop duplicates, keep order
    fullPropList.append(propList)
    #print(propList)
print(fullPropList)

# Convert to a DataFrame with the property IDs as column names (one-hot encoded)
all_property_ids = sorted({item for sublist in fullPropList for item in sublist})
print(all_property_ids)
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=all_property_ids)
for i, property_list in enumerate(fullPropList):
    df.loc[i, property_list] = 1
df.to_csv('results/PropertiesPerScrape.csv', index=True)
print(df)
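
For illustration, a toy run of the one-hot step above (hypothetical property IDs, not from the database):

import pandas as pd

fullPropList = [[101, 102], [102, 103]]   # two scrapes, hypothetical IDs
cols = sorted({p for sub in fullPropList for p in sub})
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=cols)
for i, props in enumerate(fullPropList):
    df.loc[i, props] = 1
# df:    101  102  103
#   0      1    1    0
#   1      0    1    1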

propCountperScrape.py Normal file

@@ -0,0 +1,63 @@
import Data_Analysis as DA
import pandas as pd
import matplotlib.pyplot as plt

# Read the DataFrame
propPerScrape = pd.read_csv('results/PropertiesPerScrape.csv')
propPerScrape.drop(columns=propPerScrape.columns[0], inplace=True)

# Transpose the DataFrame so that it has the same format as the property data
propPerScrape = propPerScrape.T

# Use the index as property_id and cast it to int for the merge
propPerScrape['property_id'] = propPerScrape.index
propPerScrape.property_id = propPerScrape.property_id.astype(int)
#print(propPerScrape)

# Fetch the property data and prepare it for the merge
propData = DA.getPropertyDataFromDB()
propData = pd.DataFrame(propData, columns=['property_id', 'region', 'geoLocation'])
propData = propData.drop(columns=['geoLocation'])
propData.property_id = propData.property_id.astype(int)
#print(propData)

# Merge the DataFrames
merged_df = pd.merge(propData, propPerScrape, on='property_id', how='right')
#print(merged_df)

# Create sub-DataFrames for the individual regions
heidiProp = merged_df[merged_df['region'] == 1]
davosProp = merged_df[merged_df['region'] == 2]
EngadProp = merged_df[merged_df['region'] == 3]
StMorProp = merged_df[merged_df['region'] == 4]

dfList = [heidiProp, davosProp, EngadProp, StMorProp]
outList = []
maxList = []
for df in dfList:
    df = df.drop('property_id', axis=1)
    df = df.drop('region', axis=1)
    df = df.sum()
    maxList.append(df.max())
    outList.append(df)
print(maxList)
#Heidi: 313, Davos: 296, Engadin: 597, St.Moritz: 338

# Plot the number of properties per region over all scrapes
for series in outList:
    plt.plot(series)
ax = plt.gca()
ax.set_xlim([0, 47])
plt.xlabel('Scrape number')
plt.ylabel('Number of properties')
plt.legend(["Heidiland", "Davos", "Engadin", "St. Moritz"], loc='upper left')
plt.savefig("results/Number_of_properties_over_Scrapes.png")
plt.show()
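
The four per-region sub-frames could also be aggregated in one step with groupby; a sketch under the same assumptions (merged_df as built above, region codes 1 to 4):

# Equivalent per-region sums without building four sub-DataFrames
sums = merged_df.drop(columns=['property_id']).groupby('region').sum()
print(sums.max(axis=1))   # per-region maxima, cf. maxList above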

Binary file not shown (new image, 35 KiB).

File diff suppressed because one or more lines are too long


@@ -1,5 +1,5 @@
-Heidi Mean,Heidi StdDev,Davos Mean,Davos StdDev,Engadin Mean,Engadin StdDev,St. Moritz Mean,St. Moritz StdDev
-0.8205301612054612,0.03521328245140846,0.8399836284786809,0.048358617863451414,0.8584327389672194,0.05319145459441233,0.8405512800767019,0.05180554811101561
-0.8066005018861457,0.06818803676300687,0.830601813557425,0.04949425409715446,0.8484564978404832,0.05396669349535696,0.8289395302705753,0.05637417919934374
-0.7368379473832369,0.06546064555588836,0.7598050837068276,0.06886580034893092,0.7667137312752639,0.06523018886732877,0.7565382226489596,0.06984023355676583
-0.6590943554763651,0.09741268862524224,0.6767196066764449,0.09656146924686429,0.670509578923442,0.07935806376665934,0.6633952429541463,0.08233444282881987
+,Heidi Mean,Heidi StdDev,Davos Mean,Davos StdDev,Engadin Mean,Engadin StdDev,St. Moritz Mean,St. Moritz StdDev
+timedelay_1,0.8205301612054612,0.03521328245140846,0.8399836284786809,0.048358617863451414,0.8584327389672194,0.05319145459441233,0.8405512800767019,0.05180554811101561
+timedelay_2,0.8066005018861457,0.06818803676300687,0.830601813557425,0.04949425409715446,0.8484564978404832,0.05396669349535696,0.8289395302705753,0.05637417919934374
+timedelay_10,0.7368379473832369,0.06546064555588836,0.7598050837068276,0.06886580034893092,0.7667137312752639,0.06523018886732877,0.7565382226489596,0.06984023355676583
+timedelay_20,0.6590943554763651,0.09741268862524224,0.6767196066764449,0.09656146924686429,0.670509578923442,0.07935806376665934,0.6633952429541463,0.08233444282881987
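
With index=True the timedelay labels are now written as the first column, so reading the file back needs index_col=0 (a small sketch, not part of the commit):

import pandas as pd

acc = pd.read_csv('results/accuracyOverview.csv', index_col=0)
print(acc.loc['timedelay_10', 'Heidi Mean'])   # 0.7368...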
