Created PropertiesPerScrape and a few edits

refactor-to-mono
mmaurostoffel 2024-10-24 20:04:22 +02:00
parent f7b62f4e4c
commit e8830c32e6
9 changed files with 177 additions and 7 deletions


@@ -36,6 +36,33 @@ def getDataFromDB(propId):
    return scrapeDates, calendarData

def getUniqueScrapeDates():
    # Collect the scrape date from every calendar extraction's JSON header
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions "
                "WHERE type='calendar'")
    uniqueScrapeDates = cur.fetchall()
    db.close()
    return uniqueScrapeDates

def getPropsPerScrape(scrapeDate):
    # All property_ids whose calendar was extracted on the given day
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    # Parameterised query instead of f-string interpolation (avoids SQL injection)
    cur.execute("SELECT property_id "
                "FROM extractions "
                "WHERE type='calendar' AND created_at > %s AND created_at < %s",
                (scrapeDate, str(end_date)))
    properties = cur.fetchall()
    db.close()
    return properties

def getuniquePropIdFromDB():
    '''
    Function to get unique propId from MySQL database
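
A minimal usage sketch of the two new helpers (hypothetical, not part of the commit; it assumes this module is importable as Data_Analysis and that reformatScrapeDates normalises the raw JSON dates to 'YYYY-MM-DD', as in the scripts below):

import Data_Analysis as DA

# Hypothetical driver: list how many properties each scrape day covered
dates = DA.reformatScrapeDates(DA.getUniqueScrapeDates())
for d in dict.fromkeys(dates):          # drop duplicate dates, keep order
    props = DA.getPropsPerScrape(d)     # rows of (property_id,) tuples
    print(d, len(props))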


@@ -44,7 +44,7 @@ accuracyOverview.insert(0, "Heidi Mean", heidiMean, True)
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
-accuracyOverview.to_csv('results/accuracyOverview.csv', index=False)
+accuracyOverview.to_csv('results/accuracyOverview.csv', index=True)
# delete DataFrames that are no longer needed
del merge, accuracy, propData


@@ -17,4 +17,4 @@ with open('results/allLostProperties', 'w') as f:
    write = csv.writer(f)
    write.writerow(lostProperties)
-#Output: 221 of 1552 properties are lost
+#Output: 221 of 1552 properties were lost at some point


@@ -0,0 +1,32 @@
import Data_Analysis as DA
import pandas as pd

# Read all scrape dates, reformat them and drop duplicates
uniqueScrapeDates = DA.getUniqueScrapeDates()
uniqueScrapeDates = DA.reformatScrapeDates(uniqueScrapeDates)
uniqueScrapeDates = list(dict.fromkeys(uniqueScrapeDates))
#print(uniqueScrapeDates)

# Build a list of property lists, one per scrape date
fullPropList = []
for date in uniqueScrapeDates:
    properties = DA.getPropsPerScrape(date)
    propList = [prop[0] for prop in properties]
    propList = list(dict.fromkeys(propList))  # drop duplicates, keep order
    fullPropList.append(propList)
    #print(propList)
print(fullPropList)

# Convert to a DataFrame with the property IDs as column names (one-hot encoded)
all_property_ids = sorted({item for sublist in fullPropList for item in sublist})
print(all_property_ids)
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=all_property_ids)
for i, property_list in enumerate(fullPropList):
    df.loc[i, property_list] = 1
df.to_csv('results/PropertiesPerScrape.csv', index=True)
print(df)
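
For illustration, a toy run of the one-hot step above (hypothetical property IDs, not from the database):

import pandas as pd

fullPropList = [[101, 102], [102, 103]]   # two scrapes, hypothetical IDs
cols = sorted({p for sub in fullPropList for p in sub})
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=cols)
for i, props in enumerate(fullPropList):
    df.loc[i, props] = 1
# df:    101  102  103
#   0      1    1    0
#   1      0    1    1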

propCountperScrape.py Normal file

@@ -0,0 +1,63 @@
import Data_Analysis as DA
import pandas as pd
import matplotlib.pyplot as plt

# Read the DataFrame
propPerScrape = pd.read_csv('results/PropertiesPerScrape.csv')
propPerScrape.drop(columns=propPerScrape.columns[0], inplace=True)

# Transpose the DataFrame so that it has the same format as the property data
propPerScrape = propPerScrape.T

# Use the index as property_id and cast it to int for the merge
propPerScrape['property_id'] = propPerScrape.index
propPerScrape.property_id = propPerScrape.property_id.astype(int)
#print(propPerScrape)

# Fetch the property data and prepare it for the merge
propData = DA.getPropertyDataFromDB()
propData = pd.DataFrame(propData, columns=['property_id', 'region', 'geoLocation'])
propData = propData.drop(columns=['geoLocation'])
propData.property_id = propData.property_id.astype(int)
#print(propData)

# Merge the DataFrames
merged_df = pd.merge(propData, propPerScrape, on='property_id', how='right')
#print(merged_df)

# Create sub-DataFrames for the individual regions
heidiProp = merged_df[merged_df['region'] == 1]
davosProp = merged_df[merged_df['region'] == 2]
EngadProp = merged_df[merged_df['region'] == 3]
StMorProp = merged_df[merged_df['region'] == 4]

dfList = [heidiProp, davosProp, EngadProp, StMorProp]
outList = []
maxList = []
for df in dfList:
    df = df.drop('property_id', axis=1)
    df = df.drop('region', axis=1)
    df = df.sum()
    maxList.append(df.max())
    outList.append(df)
print(maxList)
#Heidi: 313, Davos: 296, Engadin: 597, St.Moritz: 338

# Plot the number of properties per region over all scrapes
for series in outList:
    plt.plot(series)
ax = plt.gca()
ax.set_xlim([0, 47])
plt.xlabel('Scrape number')
plt.ylabel('Number of properties')
plt.legend(["Heidiland", "Davos", "Engadin", "St. Moritz"], loc='upper left')
plt.savefig("results/Number_of_properties_over_Scrapes.png")
plt.show()
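
The four per-region sub-frames could also be aggregated in one step with groupby; a sketch under the same assumptions (merged_df as built above, region codes 1 to 4):

# Equivalent per-region sums without building four sub-DataFrames
sums = merged_df.drop(columns=['property_id']).groupby('region').sum()
print(sums.max(axis=1))   # per-region maxima, cf. maxList above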

Binary file not shown (new image, 35 KiB).

File diff suppressed because one or more lines are too long


@@ -1,5 +1,5 @@
-Heidi Mean,Heidi StdDev,Davos Mean,Davos StdDev,Engadin Mean,Engadin StdDev,St. Moritz Mean,St. Moritz StdDev
-0.8205301612054612,0.03521328245140846,0.8399836284786809,0.048358617863451414,0.8584327389672194,0.05319145459441233,0.8405512800767019,0.05180554811101561
-0.8066005018861457,0.06818803676300687,0.830601813557425,0.04949425409715446,0.8484564978404832,0.05396669349535696,0.8289395302705753,0.05637417919934374
-0.7368379473832369,0.06546064555588836,0.7598050837068276,0.06886580034893092,0.7667137312752639,0.06523018886732877,0.7565382226489596,0.06984023355676583
-0.6590943554763651,0.09741268862524224,0.6767196066764449,0.09656146924686429,0.670509578923442,0.07935806376665934,0.6633952429541463,0.08233444282881987
+,Heidi Mean,Heidi StdDev,Davos Mean,Davos StdDev,Engadin Mean,Engadin StdDev,St. Moritz Mean,St. Moritz StdDev
+timedelay_1,0.8205301612054612,0.03521328245140846,0.8399836284786809,0.048358617863451414,0.8584327389672194,0.05319145459441233,0.8405512800767019,0.05180554811101561
+timedelay_2,0.8066005018861457,0.06818803676300687,0.830601813557425,0.04949425409715446,0.8484564978404832,0.05396669349535696,0.8289395302705753,0.05637417919934374
+timedelay_10,0.7368379473832369,0.06546064555588836,0.7598050837068276,0.06886580034893092,0.7667137312752639,0.06523018886732877,0.7565382226489596,0.06984023355676583
+timedelay_20,0.6590943554763651,0.09741268862524224,0.6767196066764449,0.09656146924686429,0.670509578923442,0.07935806376665934,0.6633952429541463,0.08233444282881987
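
With index=True the timedelay labels are now written as the first column, so reading the file back needs index_col=0 (a small sketch, not part of the commit):

import pandas as pd

acc = pd.read_csv('results/accuracyOverview.csv', index_col=0)
print(acc.loc['timedelay_10', 'Heidi Mean'])   # 0.7368...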
