Mauro folder deleted, closes #16
parent 5b97c7ead2
commit 468ad94430
@ -1,199 +0,0 @@
import MySQLdb
import json
from datetime import datetime, timedelta
import numpy as np


def getPropertyDataFromDB():
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()

    cur.execute("SELECT id, seed_id, check_data "
                "FROM properties")
    propData = cur.fetchall()
    db.close()

    return propData


def getDataFromDB(propId):
    '''
    Gets the data for one property from the MySQL database, filtered by the given propId
    :return: scrapeDates and calendarData
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()

    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions "
                f"WHERE type='calendar' AND property_id = {propId};")
    scrapeDates = cur.fetchall()

    cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
                "FROM extractions "
                f"WHERE type='calendar' AND property_id = {propId};")
    calendarData = cur.fetchall()
    db.close()

    return scrapeDates, calendarData


def getUniqueScrapeDates():
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()

    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions "
                "WHERE type='calendar'")
    uniqueScrapeDates = cur.fetchall()
    db.close()

    return uniqueScrapeDates


def getPropsPerScrape(scrapeDate):
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)

    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()

    cur.execute("SELECT property_id "
                "FROM extractions "
                f"WHERE type='calendar' AND created_at > '{scrapeDate}' AND created_at < '{str(end_date)}'")
    propsPerScrape = cur.fetchall()
    db.close()

    return propsPerScrape


def getuniquePropIdFromDB():
    '''
    Gets the unique property ids from the MySQL database
    :return: propList
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    cur.execute("SELECT DISTINCT property_id "
                "FROM extractions;")
    propIds = cur.fetchall()
    db.close()

    propList = []
    for propId in propIds:
        propList.append(propId[0])

    return propList


def reformatScrapeDates(scrapeDatesIn):
    '''
    Reformats the scrape-date rows to a shortened date format (YYYY-MM-DD)
    :param scrapeDatesIn: raw rows from the scrape-date query
    :return: list of date strings
    '''
    scrapeDates = []
    for row in scrapeDatesIn:
        date = datetime.strptime(json.loads(row[0])[0], '%a, %d %b %Y %H:%M:%S %Z').date()
        dateStr = date.strftime('%Y-%m-%d')  # renamed so the built-in str is not shadowed
        scrapeDates.append(dateStr)

    return scrapeDates


def checkForLostProprty(calendarData):
    '''
    Checks whether there are None entries in the calendarData, meaning the property was no longer found
    :param calendarData: rows from the calendar query
    :return: Boolean indicating whether there are None entries in the calendarData
    '''
    for row in calendarData:
        if None in row:
            return True
    return False


def getMinMaxDate(calendarData):
    '''
    Gets the min and max date from the calendar data
    :param calendarData: all calendar data from the query
    :return: list of every date between the minimal and the maximal date
    '''
    # determine the minimal and maximal date
    fullDateList = []
    for row in calendarData:
        tempJson = json.loads(row[0]).keys()
        for key in tempJson:
            # print(key)
            fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())

    end_dt = max(fullDateList)
    start_dt = min(fullDateList)
    delta = timedelta(days=1)
    HeaderDates = []

    while start_dt <= end_dt:
        HeaderDates.append(start_dt)
        start_dt += delta

    return HeaderDates


def creatDataMatrix(HeaderDates, calendarData):
    '''
    Creates the data matrix from the calendar data
    :param HeaderDates: list of all possible dates in the dataset, used as the column headers
    :param calendarData: the main information from the SQL query
    :return: data matrix with one row per scrape and one column per date (-1 = date not present)
    '''
    data = []
    for row in calendarData:
        tempList = [-1] * len(HeaderDates)
        tempJson = json.loads(row[0])
        for key in tempJson:
            date = datetime.strptime(key, '%Y-%m-%d').date()
            content = tempJson[key]
            index = [i for i, x in enumerate(HeaderDates) if x == date]
            tempList[index[0]] = content
        data.append(tempList)

    return data


def getAccuracy(df, baseLine, compLine):
    '''
    Calculates the accuracy between two rows of a given dataframe
    :param df: dataframe with one row per scrape; the first column (ScrapeDate) is skipped
    :param baseLine: row index of the baseline scrape
    :param compLine: row index of the scrape compared against the baseline
    :return: Accuracy: the share of dates that held the same information in both baseLine and compLine,
             or -1 if compLine does not exist
    '''
    try:
        df = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    first = True
    for series_name, series in df.items():
        if first:
            first = False  # skip the first column (ScrapeDate)
        else:
            total += 1
            # print(series_name)
            if series[baseLine] != -1:
                if series[compLine] != -1:
                    if series[baseLine] == series[compLine]:
                        noChange += 1

    accuracy = noChange / total
    return accuracy


def getMeanAccuracy(accList):
    '''
    Gets the mean accuracy over the entire time delay of one property
    :param accList: list of accuracy values of a comparison
    :return: average of the accuracy values, ignoring the -1 values
    '''
    out = []
    for row in accList:
        row = [x for x in row if x != -1]
        out.append(np.average(row))
    return out
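A minimal usage sketch of the two accuracy helpers above (not part of the original module): the toy DataFrame and its values are made up for illustration. The first column stands in for the ScrapeDate column and is skipped; -1 marks dates a scrape did not cover.

import pandas as pd

# Hypothetical toy matrix: one row per scrape, first column is the scrape date,
# remaining columns hold the calendar value per date (-1 = date missing in that scrape).
toy = pd.DataFrame({
    'ScrapeDate': ['2024-01-01', '2024-01-04', '2024-01-07'],
    '2024-02-01': [0, 0, 2],
    '2024-02-02': [1, -1, 1],
    '2024-02-03': [2, 2, 2],
})

print(getAccuracy(toy, 0, 1))              # 2 of 3 date columns unchanged -> 0.666...
print(getAccuracy(toy, 1, 3))              # row 3 does not exist -> -1 sentinel
print(getMeanAccuracy([[2/3, -1, 1.0]]))   # -1 entries are ignored -> [0.833...]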
@ -1,83 +0,0 @@
from datetime import datetime, timedelta
import json

import MySQLdb                # version 2.2.4
import pandas as pd           # version 2.2.3
import plotly.express as px  # version 5.24.1

db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
cur = db.cursor()

cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
            "FROM extractions "
            "WHERE type='calendar' AND property_id = 200;")
dateoutput = cur.fetchall()

cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
            "FROM extractions "
            "WHERE type='calendar' AND property_id = 200;")
output = cur.fetchall()
db.close()

# create the list of scrape dates
ytickVals = list(range(0, 30, 5))
scrapeDates = []
# print(dateoutput)
for row in dateoutput:
    date = datetime.strptime(json.loads(row[0])[0], '%a, %d %b %Y %H:%M:%S %Z').date()
    dateStr = date.strftime('%d/%m/%Y')  # renamed so the built-in str is not shadowed
    scrapeDates.append(dateStr)

# determine the minimal and maximal date
fullDateList = []
for row in output:
    tempJson = json.loads(row[0]).keys()
    for key in tempJson:
        # print(key)
        fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())

end_dt = max(fullDateList)
start_dt = min(fullDateList)
delta = timedelta(days=1)
HeaderDates = []

while start_dt <= end_dt:
    HeaderDates.append(start_dt)
    start_dt += delta

# create the data matrix
data = []
for row in output:
    tempList = [-1] * len(HeaderDates)
    tempJson = json.loads(row[0])
    for key in tempJson:
        date = datetime.strptime(key, '%Y-%m-%d').date()
        content = tempJson[key]
        index = [i for i, x in enumerate(HeaderDates) if x == date]
        tempList[index[0]] = content
    data.append(tempList)

# transform to a DataFrame for Plotly
df = pd.DataFrame(data, columns=HeaderDates)

# generate the Plotly diagram
colScale = [[0, 'rgb(0, 0, 0)'], [0.33, 'rgb(204, 16, 16)'], [0.66, 'rgb(10, 102, 15)'], [1, 'rgb(17, 184, 26)']]
fig = px.imshow(df, color_continuous_scale=colScale)
lines = list(range(0, 30, 1))
for i in lines:
    # fig.add_hline(y=i + 0.5, line_color="white")
    fig.add_hline(y=i + 0.5)

fig.update_layout(yaxis=dict(tickfont=dict(size=50)))
fig.update_layout(xaxis=dict(tickfont=dict(size=50)))
# axis titles: "Verfügbarkeitsdaten Mietobjekt" = availability data of the rental property,
# "Scrapingvorgang" = scraping run
fig.update_layout(xaxis_title="Verfügbarkeitsdaten Mietobjekt", yaxis_title="Scrapingvorgang")
fig.update_xaxes(title_font_size=100, title_font_weight="bold")
fig.update_yaxes(title_font_size=100, title_font_weight="bold")
fig.update_layout(yaxis=dict(tickmode='array', tickvals=ytickVals, ticktext=scrapeDates))
fig.update_xaxes(title_standoff=80)
fig.update_yaxes(title_standoff=80)
fig.update_layout(xaxis={'side': 'top'})
fig.show()
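If the heatmap should be stored next to the CSV results rather than only shown, Plotly can export the figure directly; a small sketch with placeholder file names, assuming a results/ directory exists and, for the PNG variant, that the kaleido package is installed.

# Hypothetical export of the figure built above; file names are placeholders.
fig.write_html("results/availability_prop200.html")           # interactive HTML, no extra dependency
fig.write_image("results/availability_prop200.png", scale=2)  # static image, requires kaleido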
@ -1,58 +0,0 @@
import Data_Analysis as DA
import pandas as pd

accuracy = pd.read_csv('results/accMeanDf.csv')

propData = DA.getPropertyDataFromDB()
propData = pd.DataFrame(propData, columns=['property_id', 'region', 'geoLocation'])
propData = propData.drop(columns=['geoLocation'])
# print(propData)

merge = pd.merge(propData, accuracy, on="property_id")
# print(merge)

# 1 = Heidiland, 2 = Davos, 3 = Engadin, 4 = St. Moritz

heidiAcc = merge[merge['region'] == 1]
davosAcc = merge[merge['region'] == 2]
EngadAcc = merge[merge['region'] == 3]
StMorAcc = merge[merge['region'] == 4]

heidiMean = heidiAcc.mean(axis=0)
davosMean = davosAcc.mean(axis=0)
EngadMean = EngadAcc.mean(axis=0)
StMorMean = StMorAcc.mean(axis=0)

heidiSDev = heidiAcc.std(axis=0)
davosSDev = davosAcc.std(axis=0)
EngadSDev = EngadAcc.std(axis=0)
StMorSDev = StMorAcc.std(axis=0)

accuracyOverview = pd.DataFrame()

accuracyOverview.insert(0, "St. Moritz StdDev", StMorSDev, True)
accuracyOverview.insert(0, "St. Moritz Mean", StMorMean, True)
accuracyOverview.insert(0, "Engadin StdDev", EngadSDev, True)
accuracyOverview.insert(0, "Engadin Mean", EngadMean, True)
accuracyOverview.insert(0, "Davos StdDev", davosSDev, True)
accuracyOverview.insert(0, "Davos Mean", davosMean, True)
accuracyOverview.insert(0, "Heidi StdDev", heidiSDev, True)
accuracyOverview.insert(0, "Heidi Mean", heidiMean, True)

# drop the first two rows (the property_id and region aggregates), which are not accuracy values
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
accuracyOverview.to_csv('results/accuracyOverview.csv', index=True)

# delete the DataFrames that are no longer used
del merge, accuracy, propData
del heidiAcc, davosAcc, EngadAcc, StMorAcc
del heidiMean, davosMean, EngadMean, StMorMean
del heidiSDev, davosSDev, EngadSDev, StMorSDev

print(accuracyOverview)
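The per-region means and standard deviations above can also be computed in one step with a pandas groupby; a sketch of that alternative, assuming the same merge DataFrame with its region and timedelay_* columns (run before the del statements at the end of the script; the region labels follow the comment above).

# Equivalent aggregation: one row per region, mean and std per timedelay column.
regionNames = {1: 'Heidiland', 2: 'Davos', 3: 'Engadin', 4: 'St. Moritz'}
delayCols = ['timedelay_1', 'timedelay_2', 'timedelay_10', 'timedelay_20']
overview = merge.groupby('region')[delayCols].agg(['mean', 'std'])
overview.index = overview.index.map(regionNames)
print(overview)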
@ -1,73 +0,0 @@
import pandas as pd
import os
import re
import numpy as np


def getAccuracy(df, baseLine, compLine):
    try:
        df = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    first = True
    for series_name, series in df.items():
        if first:
            first = False  # skip the first column (ScrapeDate)
        else:
            total += 1
            # print(series_name)
            if series[baseLine] != -1:
                if series[compLine] != -1:
                    if series[baseLine] == series[compLine]:
                        noChange += 1

    accuracy = noChange / total
    return accuracy


def getMeanAccuracy(accList):
    out = []
    for row in accList:
        row = [x for x in row if x != -1]
        out.append(np.average(row))
    return out


deltaList = [1, 2, 10, 20]
# 1  = one scrape interval
# 2  = approx. one week
# 10 = one month (30 days)
# 20 = two months

directory = os.fsencode("dok")

columnNames = ['property_id', 'timedelay_1', 'timedelay_2', 'timedelay_10', 'timedelay_20']
accListDf = pd.DataFrame(columns=columnNames)
accMeanDf = pd.DataFrame(columns=columnNames)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        propId = re.findall(r"\d+", filename)[0]
        print(propId)
        df = pd.read_csv(f'dok/{filename}')
        fullList = []
        accList = []
        # loop through all deltas in the deltaList
        for delta in deltaList:
            accList = []
            # loop through all dates as the baseline date
            for i in range(df.shape[0]):
                acc = getAccuracy(df, i, i + delta)
                accList.append(acc)
            fullList.append(accList)

        meanList = getMeanAccuracy(fullList)
        accListDf = accListDf._append({'property_id': propId, 'timedelay_1': fullList[0], 'timedelay_2': fullList[1], 'timedelay_10': fullList[2], 'timedelay_20': fullList[3]}, ignore_index=True)
        accMeanDf = accMeanDf._append({'property_id': propId, 'timedelay_1': meanList[0], 'timedelay_2': meanList[1], 'timedelay_10': meanList[2], 'timedelay_20': meanList[3]}, ignore_index=True)

accListDf.to_csv('results/accListDf.csv', index=False)
accMeanDf.to_csv('results/accMeanDf.csv', index=False)
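DataFrame._append is a private pandas method (the public DataFrame.append was removed in pandas 2.x), so a more robust variant is to collect plain dicts and build each frame once at the end; a self-contained sketch with made-up values.

import pandas as pd

# Collect one dict per property, then build the DataFrame in a single call.
columnNames = ['property_id', 'timedelay_1', 'timedelay_2', 'timedelay_10', 'timedelay_20']
rows = []
for propId, meanList in [('200', [0.9, 0.8, 0.7, 0.6]), ('201', [0.95, 0.85, 0.75, 0.65])]:
    rows.append({'property_id': propId,
                 'timedelay_1': meanList[0], 'timedelay_2': meanList[1],
                 'timedelay_10': meanList[2], 'timedelay_20': meanList[3]})
accMeanDf = pd.DataFrame(rows, columns=columnNames)
print(accMeanDf)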
@ -1,20 +0,0 @@
import Data_Analysis as DA
import csv

propIds = DA.getuniquePropIdFromDB()

lostProperties = []

for propId in propIds:
    print(propId)
    scrapeDates, calendarData = DA.getDataFromDB(propId)
    if DA.checkForLostProprty(calendarData):
        lostProperties.append(propId)

print(f"{len(lostProperties)} of {len(propIds)} properties are lost")

with open('results/allLostProperties', 'w') as f:
    write = csv.writer(f)
    write.writerow(lostProperties)

# Output: 221 of 1552 properties were lost at some point
@ -1,28 +0,0 @@
import Data_Analysis as DA
import pandas as pd
import os

propIds = DA.getuniquePropIdFromDB()

for propId in propIds:
    name = f"dok/calendarData_prop{propId}.csv"
    if not os.path.exists(name):
        print(propId)
        scrapeDates, calendarData = DA.getDataFromDB(propId)
        if DA.checkForLostProprty(calendarData):
            print(f"Lost Property: {propId}")
        else:
            scrapeDates = DA.reformatScrapeDates(scrapeDates)
            HeaderDates = DA.getMinMaxDate(calendarData)
            data = DA.creatDataMatrix(HeaderDates, calendarData)

            # transform to a DataFrame for Plotly
            df = pd.DataFrame(data, columns=HeaderDates)
            df.insert(0, "ScrapeDate", scrapeDates, True)

            df = df.drop(index=0)  # irregular gap between scraping runs (only 2 days)
            df = df.drop(df.columns[[1, 2]], axis=1)
            df.to_csv(name, index=False)
@ -1,32 +0,0 @@
import Data_Analysis as DA
import pandas as pd

# read all scrape dates, reformat them and remove duplicates
uniqueScrapeDates = DA.getUniqueScrapeDates()
uniqueScrapeDates = DA.reformatScrapeDates(uniqueScrapeDates)
uniqueScrapeDates = list(dict.fromkeys(uniqueScrapeDates))
# print(uniqueScrapeDates)

# build the list of property lists per scrape date
fullPropList = []
for date in uniqueScrapeDates:
    propList = []
    strDate = date
    properties = DA.getPropsPerScrape(strDate)
    for prop in properties:
        propList.append(prop[0])
    propList = list(dict.fromkeys(propList))
    fullPropList.append(propList)
    # print(propList)
print(fullPropList)

# convert to a DataFrame with the property IDs as column names (one-hot encoding)
all_property_ids = sorted(set([item for sublist in fullPropList for item in sublist]))
print(all_property_ids)
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=all_property_ids)
for i, property_list in enumerate(fullPropList):
    df.loc[i, property_list] = 1

df.to_csv('results/PropertiesPerScrape.csv', index=True)

print(df)
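Once the one-hot frame exists, row and column sums summarise the coverage; a small sketch operating on the df built above.

# Row sums: number of properties seen in each scrape; column sums: number of scrapes per property.
propsPerScrape = df.sum(axis=1)
scrapesPerProp = df.sum(axis=0)
print(propsPerScrape)
print(scrapesPerProp.sort_values().head(10))  # the ten properties scraped least often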
@ -1,121 +0,0 @@
from etl.src import data
import json
import polars as pl
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

'''
# Get the data from the DB (run once, then cached as results/data_quality.csv)
inst = data.load()

df = inst.extractions_with_region().pl()
print(df)

counter = 0
data = []
for row in df.iter_rows():
    property_id = row[1]
    created_at = row[2].date()
    dict = {'property_id': property_id, 'created_at': created_at, 'name': row[3]}

    jsonStr = row[0]
    if jsonStr:
        calendarDict = json.loads(jsonStr)
        for key in calendarDict:
            dict[key] = calendarDict[key]

    data.append(dict)

dfNew = pl.from_dicts(data)
dfNew.write_csv('results/data_quality.csv')
print(dfNew)
'''
dfNew = pl.read_csv('results/data_quality.csv')
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))

# create row sums
dfTemp = dfNew
# temporarily remove the leading columns but keep them for later
prop = dfTemp.get_column('property_id')
dfTemp = dfTemp.drop('property_id')
crea = dfTemp.get_column('created_at')
dfTemp = dfTemp.drop('created_at')
name = dfTemp.get_column('name')
dfTemp = dfTemp.drop('name')
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
sumCol = dfTemp.get_column('sum')

# create a new DF with only property_id, created_at, location name and sum
df = pl.DataFrame([prop, crea, name, sumCol])
df = df.sort('created_at')

# Create full copy
# 0 = all regions
# 1 = Heidiland
# 2 = Davos
# 3 = Engadin
# 4 = St. Moritz
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']

filter = 4
if filter != 0:
    df = df.filter(pl.col("name") == filter)

# remove the location name
df = df.drop('name')

# get the unique property_ids
propsIDs = df.unique(subset=["property_id"])
propsIDs = propsIDs.get_column("property_id").to_list()
propsIDs.sort()

# create the matrix
matrix = []
for id in propsIDs:
    dict = {}
    temp = df.filter(pl.col("property_id") == id)
    for row in temp.iter_rows():
        dict[row[1].strftime('%Y-%m-%d')] = row[2]
    matrix.append(dict)

matrix = pl.DataFrame(matrix)
dates = matrix.columns
matrix = matrix.to_numpy()
# normalised
matrix = matrix / 1111

yRange = range(len(dates))
xRange = range(len(propsIDs))
matrix = matrix.T
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()

# create the diff matrix
diffMatrix = np.zeros((len(matrix) - 1, len(matrix[0])))
for y in range(len(matrix[0])):
    for x in range(len(matrix) - 1):
        diffMatrix[x][y] = abs(matrix[x][y] - matrix[x + 1][y])

plt.figure()
plt.imshow(diffMatrix, cmap="Reds")
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()

plt.show()
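The nested loops that fill diffMatrix can be written as a single vectorised NumPy expression; a sketch of the equivalent computation on the transposed matrix from above (absolute differences between consecutive scrape-date rows along axis 0).

# np.diff takes consecutive-row differences along axis 0, matching the loop above.
diffMatrixVec = np.abs(np.diff(matrix, axis=0))
# Should match the loop-built diffMatrix (up to floating point):
# assert np.allclose(diffMatrix, diffMatrixVec, equal_nan=True)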
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff