import MySQLdb
import json
from datetime import datetime, timedelta
import numpy as np
def getPropertyDataFromDB():
    """
    Function to get id, seed_id and check_data of every row in the properties table
    :return: propData, the rows exactly as returned by cursor.fetchall()
    """
    # NOTE(review): credentials are hard-coded here and in every other DB helper.
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT id, seed_id, check_data "
                    "FROM properties ")
        return cur.fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()
def getDataFromDB(propId):
    """
    Function to get data from MySQL database filter with the given propId
    :param propId: property_id whose calendar extractions are requested
    :return: scrapeDates and calendarData, two equally long sequences of 1-tuples;
             entry i of both sequences belongs to the same extraction row
    """
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        # One query for both columns: two separate SELECTs without ORDER BY are
        # not guaranteed to return rows in the same order.
        # propId is bound by the driver instead of being formatted into the SQL.
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date'), "
                    "JSON_EXTRACT(body, '$.content.days') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;", (propId,))
        rows = cur.fetchall()
    finally:
        db.close()
    # Keep the original result shape: one single-column row per extraction.
    scrapeDates = tuple((row[0],) for row in rows)
    calendarData = tuple((row[1],) for row in rows)
    return scrapeDates, calendarData
def getUniqueScrapeDates():
    """
    Function to get the header date of every calendar extraction
    :return: uniqueScrapeDates, rows of 1-tuples holding the JSON-encoded '$.Date' header.
             The query itself does not deduplicate; callers have to do that.
    """
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar'")
        return cur.fetchall()
    finally:
        db.close()
def getPropsPerScrape(scrapeDate):
    """
    Function to get the property_id of every calendar extraction created on one day
    :param scrapeDate: day to look up, formatted as '%Y-%m-%d'
    :return: rows of 1-tuples (property_id,) with created_at strictly between
             scrapeDate 00:00:00 and the following day 00:00:00; may contain duplicates
    :raises ValueError: if scrapeDate does not match '%Y-%m-%d'
    """
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        # Both bounds are bound as parameters instead of being formatted into the SQL.
        cur.execute("SELECT property_id "
                    "FROM extractions "
                    "WHERE type='calendar' AND created_at > %s AND created_at < %s",
                    (scrapeDate, str(end_date)))
        return cur.fetchall()
    finally:
        db.close()
def getuniquePropIdFromDB():
    """
    Function to get unique propId from MySQL database
    :return: propList, a flat list with every distinct property_id of the extractions table
    """
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT DISTINCT property_id "
                    "FROM extractions;")
        propIds = cur.fetchall()
    finally:
        db.close()
    # fetchall() yields 1-tuples; callers format the ids into file names and
    # pass them on as query parameters, so unwrap them here.
    return [propId[0] for propId in propIds]
def reformatScrapeDates(scrapeDatesIn):
    """
    Reformats the scrapeDates column to a shortened datetime format
    :param scrapeDatesIn: rows whose first field is a JSON array; its first element is
                          a date string such as 'Mon, 04 Nov 2024 10:15:00 GMT'
    :return: scrapeDates, list of '%Y-%m-%d' strings in input order
    """
    scrapeDates = []
    for row in scrapeDatesIn:
        header_date = json.loads(row[0])[0]
        parsed = datetime.strptime(header_date, '%a, %d %b %Y %H:%M:%S %Z').date()
        scrapeDates.append(parsed.strftime('%Y-%m-%d'))
    return scrapeDates
def checkForLostProprty(calendarData):
    """
    Checks if there are "None" Entries in the calendarData meaning they were no longer found
    :param calendarData: iterable of rows (tuples) as returned by the calendar query
    :return: Boolean indicating if there are "None" Entries in the calendarData
    """
    return any(None in row for row in calendarData)
def getMinMaxDate(calendarData):
    """
    Gets the min and max values from a calendar data and returns every day in between
    :param calendarData: get all calendar data from query; the first field of each row
                         is a JSON object keyed by '%Y-%m-%d' dates, or None
    :return: HeaderDates, list of consecutive dates from the minimal to the maximal
             date (both inclusive); empty if no date is present
    """
    # Determine the minimal and maximal date
    fullDateList = []
    for row in calendarData:
        if row[0] is None:
            # Extraction without calendar content ("lost" property): nothing to read.
            continue
        for key in json.loads(row[0]):
            fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())
    if not fullDateList:
        return []
    start_dt = min(fullDateList)
    end_dt = max(fullDateList)
    day_count = (end_dt - start_dt).days + 1
    return [start_dt + timedelta(days=offset) for offset in range(day_count)]
def creatDataMatrix(HeaderDates, calendarData):
    """
    Creates the data matrix from a calendar data
    :param HeaderDates: The list of all possible Dates in the dataset is used as the headers
    :param calendarData: the main information from the sql query; the first field of each
                         row is a JSON object keyed by '%Y-%m-%d' dates, or None
    :return: data Matrix with all the dates in the dataset: one list per calendarData row,
             -1 wherever the row has no entry for that header date
    :raises KeyError: if a row contains a date that is not part of HeaderDates
    """
    # Date -> column position, built once instead of scanning HeaderDates per key.
    # setdefault keeps the first position should a date appear twice.
    column_of = {}
    for position, header in enumerate(HeaderDates):
        column_of.setdefault(header, position)

    data = []
    for row in calendarData:
        tempList = [-1] * len(HeaderDates)
        if row[0] is not None:
            for key, content in json.loads(row[0]).items():
                date = datetime.strptime(key, '%Y-%m-%d').date()
                tempList[column_of[date]] = content
        # Rows without calendar content stay all -1 so the matrix keeps one
        # row per extraction.
        data.append(tempList)
    return data
def getAccuracy(df, baseLine, compLine):
    """
    Calculates the accuracy of a given dataframe with a given baseLine and compLine
    :param df: DataFrame with one row per scrape; the first column is skipped
               (the exported calendar CSVs carry the scrape date there)
    :param baseLine: positional index of the reference row
    :param compLine: positional index of the row compared against the reference
    :return: Accuracy: The percentage of dates that had the same information in both
             baseLine and compLine (as a fraction 0..1). Dates where either row is -1
             count as "not the same". Returns -1 if a row index is out of bounds or
             there is no date column to compare.
    """
    try:
        pair = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    for _, column in pair.iloc[:, 1:].items():
        # Positional access: row 0 is baseLine, row 1 is compLine, independent
        # of the index labels of df.
        base, comp = column.iloc[0], column.iloc[1]
        total += 1
        if base != -1 and comp != -1 and base == comp:
            noChange += 1
    if total == 0:
        # Nothing to compare; use the same sentinel as for a missing row.
        return -1
    return noChange / total
def getMeanAccuracy(accList):
    """
    Get the mean Accuracy of the entire timedelay of one property
    :param accList: List of accuracy Values of a comparison (one inner list per timedelay)
    :return: Average of the accuracy values while ignoring the '-1' values, one entry
             per inner list; NaN for an inner list without any valid value
    """
    out = []
    for row in accList:
        valid = [x for x in row if x != -1]
        # np.mean([]) would also give NaN but emits a RuntimeWarning.
        out.append(np.mean(valid) if valid else np.nan)
    return out

@ -1,83 +0,0 @@
from datetime import datetime, timedelta
import json
import MySQLdb #Version 2.2.4
import pandas as pd #Version 2.2.3
import plotly.express as px #Version 5.24.1

# Heatmap of the calendar content of one property: one row per scrape,
# one column per calendar day.
PROPERTY_ID = 200

db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
try:
    cur = db.cursor()
    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions "
                "WHERE type='calendar' AND property_id = %s;", (PROPERTY_ID,))
    dateoutput = cur.fetchall()
    cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
                "FROM extractions "
                "WHERE type='calendar' AND property_id = %s;", (PROPERTY_ID,))
    output = cur.fetchall()
finally:
    db.close()

# Create the list of scrape dates (used as y-axis labels)
scrapeDates = []
for row in dateoutput:
    date = datetime.strptime(json.loads(row[0])[0], '%a, %d %b %Y %H:%M:%S %Z').date()
    scrapeDates.append(date.strftime('%d/%m/%Y'))

# Label every 5th scrape; the label of tick i has to be the date of scrape i.
ytickVals = [i for i in range(0, 30, 5) if i < len(scrapeDates)]
ytickText = [scrapeDates[i] for i in ytickVals]

# Determine the minimal and maximal date
fullDateList = []
for row in output:
    if row[0] is None:
        # Extraction without calendar content.
        continue
    for key in json.loads(row[0]):
        fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())
HeaderDates = []
if fullDateList:
    start_dt = min(fullDateList)
    end_dt = max(fullDateList)
    delta = timedelta(days=1)
    while start_dt <= end_dt:
        HeaderDates.append(start_dt)
        start_dt += delta

# Create data-Matrix (-1 = no information for that day in that scrape)
columnOf = {header: position for position, header in enumerate(HeaderDates)}
data = []
for row in output:
    tempList = [-1] * len(HeaderDates)
    if row[0] is not None:
        for key, content in json.loads(row[0]).items():
            date = datetime.strptime(key, '%Y-%m-%d').date()
            tempList[columnOf[date]] = content
    data.append(tempList)

# Transform to Dataframe for Plotly
df = pd.DataFrame(data, columns=HeaderDates)

# Generate Plotly diagram
colScale = [[0, 'rgb(0, 0, 0)'], [0.33, 'rgb(204, 16, 16)'], [0.66, 'rgb(10, 102, 15)'], [1, 'rgb(17, 184, 26)']]
fig = px.imshow(df, color_continuous_scale=colScale)
# Horizontal separator lines between the scrapes are intentionally disabled:
# the add_hline call was commented out in the original script.
fig.update_layout(yaxis=dict(tickfont=dict(size=50)))
fig.update_layout(xaxis=dict(tickfont=dict(size=50)))
fig.update_layout(xaxis_title="Verfügbarkeitsdaten Mietobjekt", yaxis_title="Scrapingvorgang")
fig.update_xaxes(title_font_size=100, title_font_weight="bold")
fig.update_yaxes(title_font_size=100, title_font_weight="bold")
fig.update_layout(yaxis=dict(tickmode='array', tickvals=ytickVals, ticktext=ytickText))
fig.update_xaxes(title_standoff=80)
fig.update_yaxes(title_standoff=80)
fig.update_layout(xaxis={'side': 'top'})
# NOTE(review): no fig.show() / fig.write_image() call is visible in this script;
# confirm how the figure is supposed to be rendered.

@ -1,58 +0,0 @@
import Data_Analysis as DA
import pandas as pd
# Per-property mean accuracies joined with the region of each property.
accuracy = pd.read_csv('results/accMeanDf.csv')
propData = pd.DataFrame(DA.getPropertyDataFromDB(),
                        columns=['property_id', 'region', 'geoLocation'])
propData = propData.drop(columns=['geoLocation'])
merge = pd.merge(propData, accuracy, on="property_id")

# Region codes: 1 = Heidiland, 2 = Davos, 3 = Engadin, 4 = St. Moritz
regionLabels = [(1, "Heidi"), (2, "Davos"), (3, "Engadin"), (4, "St. Moritz")]

# Every column is inserted at position 0, so walk the regions backwards and add
# StdDev before Mean to end up with "<region> Mean, <region> StdDev" in
# ascending region order.
accuracyOverview = pd.DataFrame()
for regionCode, regionLabel in reversed(regionLabels):
    regionAcc = merge[merge['region'] == regionCode]
    accuracyOverview.insert(0, f"{regionLabel} StdDev", regionAcc.std(axis=0), True)
    accuracyOverview.insert(0, f"{regionLabel} Mean", regionAcc.mean(axis=0), True)

# The first two index entries are the statistics of the property_id and region
# columns themselves; only the timedelay rows are kept.
accuracyOverview.drop(index=accuracyOverview.index[:2], inplace=True)
accuracyOverview.to_csv('results/accuracyOverview.csv', index=True)

# Delete the intermediate objects that are no longer needed
del merge, accuracy, propData
del regionAcc, regionCode, regionLabel, regionLabels

@ -1,73 +0,0 @@
import pandas as pd
import os
import re
import numpy as np
def getAccuracy(df, baseLine, compLine):
    """
    Calculates the share of dates that carry the same value in two scrapes
    :param df: DataFrame with one row per scrape; the first column is skipped
               (the calendar CSVs read below carry the scrape date there)
    :param baseLine: positional index of the reference row
    :param compLine: positional index of the row compared against the reference
    :return: fraction (0..1) of date columns where both rows are not -1 and equal;
             -1 if a row index is out of bounds or there is no date column
    """
    try:
        pair = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    for _, column in pair.iloc[:, 1:].items():
        # Positional access: row 0 is baseLine, row 1 is compLine, independent
        # of the index labels of df.
        base, comp = column.iloc[0], column.iloc[1]
        total += 1
        if base != -1 and comp != -1 and base == comp:
            noChange += 1
    if total == 0:
        # Nothing to compare; use the same sentinel as for a missing row.
        return -1
    return noChange / total
def getMeanAccuracy(accList):
    """
    Averages each list of accuracy values, ignoring the -1 sentinel
    :param accList: list of lists of accuracy values (one inner list per timedelay)
    :return: list with one mean per inner list; NaN where no valid value exists
    """
    out = []
    for row in accList:
        valid = [x for x in row if x != -1]
        # np.mean([]) would also give NaN but emits a RuntimeWarning.
        out.append(np.mean(valid) if valid else np.nan)
    return out
# Row distances (in scrapes) between the baseline and the compared scrape.
deltaList = [1, 2, 10, 20]
#1 = 1 scrape interval
#2 = approx. 1 week
#10 = 1 month (30 days)
#20 = 2 months
directory = "dok"
columnNames = ['property_id', 'timedelay_1', 'timedelay_2', 'timedelay_10', 'timedelay_20']

# Rows are collected in plain lists and turned into DataFrames once at the end;
# this avoids the private DataFrame._append API.
accListRows = []
accMeanRows = []
for filename in os.listdir(directory):
    if not filename.endswith(".csv"):
        continue
    # The first number in the file name is the property id.
    propId = re.findall(r"\d+", filename)[0]
    df = pd.read_csv(f'dok/{filename}')
    fullList = []
    # Loop through all deltas in the deltaList
    for delta in deltaList:
        # Use every scrape as baseline; getAccuracy returns -1 once
        # i + delta runs past the last row.
        accList = [getAccuracy(df, i, i + delta) for i in range(df.shape[0])]
        fullList.append(accList)
    meanList = getMeanAccuracy(fullList)
    accListRows.append(dict(zip(columnNames, [propId, *fullList])))
    accMeanRows.append(dict(zip(columnNames, [propId, *meanList])))

accListDf = pd.DataFrame(accListRows, columns=columnNames)
accMeanDf = pd.DataFrame(accMeanRows, columns=columnNames)
accListDf.to_csv('results/accListDf.csv', index=False)
accMeanDf.to_csv('results/accMeanDf.csv', index=False)

@ -1,20 +0,0 @@
import Data_Analysis as DA
import csv
# Collect every property that has at least one extraction without calendar data.
propIds = DA.getuniquePropIdFromDB()
lostProperties = []
for propId in propIds:
    scrapeDates, calendarData = DA.getDataFromDB(propId)
    if DA.checkForLostProprty(calendarData):
        lostProperties.append(propId)
print(f"{len(lostProperties)} of {len(propIds)} properties are lost")
# newline='' is what the csv module expects for files it writes to.
with open('results/allLostProperties', 'w', newline='') as f:
    write = csv.writer(f)
    # NOTE(review): the original write call is not visible; the ids are written
    # as one CSV row here - confirm the expected layout.
    write.writerow(lostProperties)
#Output: 221 of 1552 properties were lost at some point

@ -1,28 +0,0 @@
import Data_Analysis as DA
import pandas as pd
import os
# Export the calendar matrix of every property to dok/calendarData_prop<id>.csv.
propIds = DA.getuniquePropIdFromDB()
for propId in propIds:
    name = f"dok/calendarData_prop{propId}.csv"
    if os.path.exists(name):
        # Already exported in an earlier run.
        continue
    scrapeDates, calendarData = DA.getDataFromDB(propId)
    if DA.checkForLostProprty(calendarData):
        print(f"Lost Proprty: {propId}")
        # A lost property has None rows that cannot be decoded as JSON by the
        # helpers below, so no CSV is written for it.
        continue
    scrapeDates = DA.reformatScrapeDates(scrapeDates)
    HeaderDates = DA.getMinMaxDate(calendarData)
    data = DA.creatDataMatrix(HeaderDates, calendarData)
    # Transform to Dataframe for Plotly
    df = pd.DataFrame(data, columns=HeaderDates)
    df.insert(0, "ScrapeDate", scrapeDates, True)
    df = df.drop(index=0)  # Irregular gap between scraping runs (only 2 days)
    # NOTE(review): drops the first two date columns (positions 1 and 2, right
    # after ScrapeDate) - presumably tied to the removed first scrape; confirm.
    df = df.drop(df.columns[[1, 2]], axis=1)
    df.to_csv(name, index=False)

@ -1,32 +0,0 @@
import Data_Analysis as DA
import pandas as pd
# Read all scrape dates, reformat them and remove duplicates (order preserved)
uniqueScrapeDates = DA.getUniqueScrapeDates()
uniqueScrapeDates = DA.reformatScrapeDates(uniqueScrapeDates)
uniqueScrapeDates = list(dict.fromkeys(uniqueScrapeDates))

# Build the list of property lists, one list per scrape date
fullPropList = []
for date in uniqueScrapeDates:
    properties = DA.getPropsPerScrape(date)
    # Rows are 1-tuples (property_id,); dict.fromkeys drops repeated ids
    # while keeping their first-seen order.
    propList = list(dict.fromkeys(prop[0] for prop in properties))
    fullPropList.append(propList)

# Convert to a DataFrame with the property ids as column names (one-hot encoding):
# row i has a 1 for every property that was scraped on uniqueScrapeDates[i].
all_property_ids = sorted({item for sublist in fullPropList for item in sublist})
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=all_property_ids)
for i, property_list in enumerate(fullPropList):
    df.loc[i, property_list] = 1
df.to_csv('results/PropertiesPerScrape.csv', index=True)

@ -1,121 +0,0 @@
from etl.src import data
import json
import polars as pl
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
# Get Data from DB
inst = data.load()
df = inst.extractions_with_region().pl()

# Flatten every extraction into one record: property id, scrape date, location
# name and one entry per calendar day found in the JSON payload.
# assumes the row layout (calendar JSON, property_id, created_at, name) - TODO confirm
records = []
for row in df.iter_rows():
    record = {'property_id': row[1], 'created_at': row[2].date(), 'name': row[3]}
    jsonStr = row[0]
    if jsonStr:
        record.update(json.loads(jsonStr))
    records.append(record)
dfNew = pl.from_dicts(records)
# NOTE(review): the frame built above is replaced by the cached CSV right away and
# no write of that CSV is visible in this script - confirm where it is produced.
dfNew = pl.read_csv('results/data_quality.csv')
dfNew = dfNew.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))

# Create row sums over all calendar-day columns and keep only
# property_id, created_at, location name and that sum.
idCols = ['property_id', 'created_at', 'name']
dayCols = [col for col in dfNew.columns if col not in idCols]
df = dfNew.select(idCols + [pl.sum_horizontal(dayCols).alias('sum')])
df = df.sort('created_at')

# Region filter
# 0 = all regions
# 1 = Heidiland
# 2 = Davos
# 3 = Engadin
# 4 = St. Moritz
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']
regionFilter = 4
if regionFilter != 0:
    # "name" holds the location name, so compare against the label rather than
    # the numeric code.
    # NOTE(review): confirm the labels match the names stored in the DB.
    df = df.filter(pl.col("name") == filterList[regionFilter])

# Remove Location name
df = df.drop('name')

# Get unique property_ids (first-seen order, so the plot axis is reproducible)
propsIDs = df.get_column("property_id").unique(maintain_order=True).to_list()

# Create matrix: one dict per property mapping scrape date -> row sum
matrix = []
for propId in propsIDs:
    temp = df.filter(pl.col("property_id") == propId)
    matrix.append({created.strftime('%Y-%m-%d'): total
                   for _, created, total in temp.iter_rows()})
# Scan every dict for the schema so dates missing from the first rows are kept.
matrix = pl.DataFrame(matrix, infer_schema_length=None)
# ISO date strings sort chronologically.
dates = sorted(matrix.columns)
matrix = matrix.select(dates).to_numpy()

# normalized
# NOTE(review): 1111 is an unexplained constant - presumably the maximal
# possible row sum; confirm.
matrix = matrix / 1111

yRange = range(len(dates))
xRange = range(len(propsIDs))
# Rows = scrape dates, columns = properties
matrix = matrix.T

plt.figure()
# NOTE(review): the original draw call for this first plot is not visible;
# the ticks and labels below are set up for the normalized matrix.
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")

# Create DiffMatrix: absolute change between consecutive scrape dates
diffMatrix = np.abs(np.diff(matrix, axis=0))
plt.figure()
plt.imshow(diffMatrix, cmap="Reds")
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")

