Created PropertiesPerScrape and a few edits

parent f7b62f4e4c
commit e8830c32e6
@@ -36,6 +36,33 @@ def getDataFromDB(propId):
     return scrapeDates, calendarData
 
+
+def getUniqueScrapeDates():
+    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
+    cur = db.cursor()
+
+    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
+                "FROM extractions "
+                "WHERE type='calendar'")
+    uniqueScrapeDates = cur.fetchall()
+    db.close()
+
+    return uniqueScrapeDates
+
+
+def getPropsPerScrape(scrapeDate):
+    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
+    end_date = date + timedelta(days=1)
+
+    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
+    cur = db.cursor()
+
+    cur.execute("SELECT property_id "
+                "FROM extractions "
+                f"WHERE type='calendar' AND created_at > '{scrapeDate}' AND created_at < '{str(end_date)}'")
+    uniqueScrapeDates = cur.fetchall()
+    db.close()
+    return uniqueScrapeDates
 
 
 def getuniquePropIdFromDB():
     '''
     Function to get unique propId from MySQL database
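A note on getPropsPerScrape: building the WHERE clause with an f-string works, but it splices scrapeDate straight into the SQL text. MySQLdb also accepts parameterized queries, which let the driver handle quoting. A minimal sketch of that variant, assuming the same extractions table; the name getPropsPerScrapeParam is hypothetical, not part of the commit:

    def getPropsPerScrapeParam(scrapeDate):
        # Same query as above, but with %s placeholders escaped by the driver
        date = datetime.strptime(scrapeDate, '%Y-%m-%d')
        end_date = date + timedelta(days=1)
        db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
        cur = db.cursor()
        cur.execute("SELECT property_id FROM extractions "
                    "WHERE type='calendar' AND created_at > %s AND created_at < %s",
                    (scrapeDate, str(end_date)))
        rows = cur.fetchall()
        db.close()
        return rows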
Binary file not shown.
@@ -44,7 +44,7 @@ accuracyOverview.insert(0, "Heidi Mean", heidiMean, True)
 
 accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
 accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
-accuracyOverview.to_csv('results/accuracyOverview.csv', index=False)
+accuracyOverview.to_csv('results/accuracyOverview.csv', index=True)
 
 #delete unused DF's
 del merge, accuracy, propData
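Switching to index=True is what writes the row labels (timedelay_1, timedelay_2, ...) that appear as the new leading column in the accuracyOverview.csv diff further down. A small round-trip sketch, assuming that file layout:

    import pandas as pd

    # index_col=0 restores the timedelay_* labels as the row index
    accuracy = pd.read_csv('results/accuracyOverview.csv', index_col=0)
    print(accuracy.loc['timedelay_1', 'Heidi Mean'])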
@@ -17,4 +17,4 @@ with open('results/allLostProperties', 'w') as f:
     write = csv.writer(f)
     write.writerow(lostProperties)
 
-#Output: 221 of 1552 properties are lost
+#Output: 221 of 1552 properties were lost at some point
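This hunk only rewords the comment; the code that fills lostProperties is not shown. One plausible reading of "lost at some point", checked against the new PropertiesPerScrape matrix, is a property whose one-hot column ever drops from 1 back to 0. A hypothetical sketch, not the committed code:

    import pandas as pd

    # Hypothetical check: a 1 -> 0 transition means the property vanished
    # from a later scrape after appearing in an earlier one.
    presence = pd.read_csv('results/PropertiesPerScrape.csv', index_col=0)
    lost = [c for c in presence.columns if (presence[c].diff() == -1).any()]
    print(f"{len(lost)} of {presence.shape[1]} properties were lost at some point")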
@@ -0,0 +1,32 @@
+import Data_Analysis as DA
+import pandas as pd
+
+#read all scrape dates, reformat them and drop duplicates
+uniqueScrapeDates = DA.getUniqueScrapeDates()
+uniqueScrapeDates = DA.reformatScrapeDates(uniqueScrapeDates)
+uniqueScrapeDates = list(dict.fromkeys(uniqueScrapeDates))
+#print(uniqueScrapeDates)
+
+#build the list of lists of properties per scrape date
+fullPropList = []
+for date in uniqueScrapeDates:
+    propList = []
+    strDate = date
+    properties = DA.getPropsPerScrape(strDate)
+    for prop in properties:
+        propList.append(prop[0])
+    propList = list(dict.fromkeys(propList))
+    fullPropList.append(propList)
+    #print(propList)
+print(fullPropList)
+
+#convert to a DF with property ID's as column names and one-hot encoding
+all_property_ids = sorted(set([item for sublist in fullPropList for item in sublist]))
+print(all_property_ids)
+df = pd.DataFrame(0, index=range(len(fullPropList)), columns=all_property_ids)
+for i, property_list in enumerate(fullPropList):
+    df.loc[i, property_list] = 1
+
+df.to_csv('results/PropertiesPerScrape.csv', index=True)
+
+print(df)
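The loop above fills the one-hot matrix with df.loc[i, property_list] = 1. An equivalent construction via explode and get_dummies, sketched here with a toy stand-in for fullPropList, yields the same 0/1 frame:

    import pandas as pd

    fullPropList = [[101, 102], [102, 103]]   # toy stand-in for the scraped lists
    s = pd.Series(fullPropList).explode()     # one row per (scrape, property) pair
    one_hot = pd.get_dummies(s).groupby(level=0).max().astype(int)
    print(one_hot)                            # rows = scrapes, columns = property IDs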
@@ -0,0 +1,63 @@
+import Data_Analysis as DA
+import pandas as pd
+import matplotlib.pyplot as plt
+
+#read in the DF
+propPerScrape = pd.read_csv('results/PropertiesPerScrape.csv')
+propPerScrape.drop(columns=propPerScrape.columns[0], axis=1, inplace=True)
+#transpose the DF so it has the same format as the propData
+propPerScrape = propPerScrape.T
+#use the index as property_id and cast to int for the merge
+propPerScrape['property_id'] = propPerScrape.index
+propPerScrape.property_id = propPerScrape.property_id.astype(int)
+#print(propPerScrape)
+
+
+#pull the propData and prepare it for the merge
+propData = DA.getPropertyDataFromDB()
+propData = pd.DataFrame(propData, columns=['property_id', 'region', 'geoLocation'])
+propData = propData.drop(columns=['geoLocation'])
+propData.property_id = propData.property_id.astype(int)
+#print(propData)
+
+
+#merge the DF's
+merged_df = pd.merge(propData, propPerScrape, on='property_id', how='right')
+#print(merged_df)
+
+
+#create sub-DF's for the individual regions
+heidiProp = merged_df[merged_df['region'] == 1]
+davosProp = merged_df[merged_df['region'] == 2]
+EngadProp = merged_df[merged_df['region'] == 3]
+StMorProp = merged_df[merged_df['region'] == 4]
+
+
+
+dfList = [heidiProp, davosProp, EngadProp, StMorProp]
+outList = []
+maxList = []
+for df in dfList:
+    df = df.drop('property_id', axis=1)
+    df = df.drop('region', axis=1)
+    df = df.sum()
+    maxList.append(df.max())
+    outList.append(df)
+
+print(maxList)
+#Heidi: 313, Davos: 296, Engadin: 597, St.Moritz: 338
+
+for series in outList:
+    plt.plot(series)
+
+
+ax = plt.gca()
+ax.set_xlim([0, 47])
+plt.xlabel('Scrape number')
+plt.ylabel('number of properties')
+plt.legend(["Heidiland", "Davos", "Engadin", "St. Moritz"], loc='upper left')
+plt.savefig("results/Number_of_properties_over_Scrapes.png")
+
+plt.show()
+
+plt.draw()
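One caveat on the plotting tail: plt.show() blocks and tears the figure down once the window closes, so the trailing plt.draw() has no effect when this runs as a script. The important ordering, kept above, is that savefig comes before show:

    plt.savefig("results/Number_of_properties_over_Scrapes.png")  # save while the figure still exists
    plt.show()  # blocks; after the window closes the figure is gone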
Binary file not shown.
[New image added: 35 KiB]
File diff suppressed because one or more lines are too long
@@ -1,5 +1,5 @@
-Heidi Mean,Heidi StdDev,Davos Mean,Davos StdDev,Engadin Mean,Engadin StdDev,St. Moritz Mean,St. Moritz StdDev
+,Heidi Mean,Heidi StdDev,Davos Mean,Davos StdDev,Engadin Mean,Engadin StdDev,St. Moritz Mean,St. Moritz StdDev
-0.8205301612054612,0.03521328245140846,0.8399836284786809,0.048358617863451414,0.8584327389672194,0.05319145459441233,0.8405512800767019,0.05180554811101561
+timedelay_1,0.8205301612054612,0.03521328245140846,0.8399836284786809,0.048358617863451414,0.8584327389672194,0.05319145459441233,0.8405512800767019,0.05180554811101561
-0.8066005018861457,0.06818803676300687,0.830601813557425,0.04949425409715446,0.8484564978404832,0.05396669349535696,0.8289395302705753,0.05637417919934374
+timedelay_2,0.8066005018861457,0.06818803676300687,0.830601813557425,0.04949425409715446,0.8484564978404832,0.05396669349535696,0.8289395302705753,0.05637417919934374
-0.7368379473832369,0.06546064555588836,0.7598050837068276,0.06886580034893092,0.7667137312752639,0.06523018886732877,0.7565382226489596,0.06984023355676583
+timedelay_10,0.7368379473832369,0.06546064555588836,0.7598050837068276,0.06886580034893092,0.7667137312752639,0.06523018886732877,0.7565382226489596,0.06984023355676583
-0.6590943554763651,0.09741268862524224,0.6767196066764449,0.09656146924686429,0.670509578923442,0.07935806376665934,0.6633952429541463,0.08233444282881987
+timedelay_20,0.6590943554763651,0.09741268862524224,0.6767196066764449,0.09656146924686429,0.670509578923442,0.07935806376665934,0.6633952429541463,0.08233444282881987