Mauro folder deleted, closes #16

main
mmaurostoffel 2025-01-17 15:27:25 +01:00
parent 5b97c7ead2
commit 468ad94430
1356 changed files with 0 additions and 223201 deletions

View File

@@ -1,199 +0,0 @@
import MySQLdb
import json
from datetime import datetime, timedelta
import numpy as np


def getPropertyDataFromDB():
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    cur.execute("SELECT id, seed_id, check_data "
                "FROM properties")
    propData = cur.fetchall()
    db.close()
    return propData

def getDataFromDB(propId):
    '''
    Fetches the scrape dates and calendar data for the given propId from the MySQL database
    :return: scrapeDates and calendarData
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    # parameterised queries instead of f-strings to avoid SQL injection
    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions "
                "WHERE type='calendar' AND property_id = %s;", (propId,))
    scrapeDates = cur.fetchall()
    cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
                "FROM extractions "
                "WHERE type='calendar' AND property_id = %s;", (propId,))
    calendarData = cur.fetchall()
    db.close()
    return scrapeDates, calendarData

def getUniqueScrapeDates():
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions "
                "WHERE type='calendar'")
    uniqueScrapeDates = cur.fetchall()
    db.close()
    return uniqueScrapeDates

def getPropsPerScrape(scrapeDate):
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    # all properties whose calendar was extracted within the given day
    cur.execute("SELECT property_id "
                "FROM extractions "
                "WHERE type='calendar' AND created_at > %s AND created_at < %s", (scrapeDate, end_date))
    propIds = cur.fetchall()
    db.close()
    return propIds

def getuniquePropIdFromDB():
    '''
    Function to get unique propId from MySQL database
    :return: propList
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    cur = db.cursor()
    cur.execute("SELECT DISTINCT property_id "
                "FROM extractions;")
    propIds = cur.fetchall()
    db.close()
    propList = []
    for propId in propIds:
        propList.append(propId[0])
    return propList

def reformatScrapeDates(scrapeDatesIn):
    '''
    Reformats the scrapeDates column to a shortened date format (YYYY-MM-DD)
    :param scrapeDatesIn: raw scrape dates as returned by the DB query
    :return: list of date strings
    '''
    scrapeDates = []
    for row in scrapeDatesIn:
        date = datetime.strptime(json.loads(row[0])[0], '%a, %d %b %Y %H:%M:%S %Z').date()
        dateStr = date.strftime('%Y-%m-%d')
        scrapeDates.append(dateStr)
    return scrapeDates

def checkForLostProprty(calendarData):
    '''
    Checks if there are "None" entries in the calendarData, meaning the property was no longer found in at least one scrape
    :param calendarData:
    :return: Boolean indicating if there are "None" entries in the calendarData
    '''
    for row in calendarData:
        if None in row:
            return True
    return False

def getMinMaxDate(calendarData):
    '''
    Determines the date range covered by the calendar data
    :param calendarData: all calendar data from the query
    :return: list of all dates between the minimal and the maximal date
    '''
    # determine the minimal and maximal date
    fullDateList = []
    for row in calendarData:
        tempJson = json.loads(row[0]).keys()
        for key in tempJson:
            fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())
    end_dt = max(fullDateList)
    start_dt = min(fullDateList)
    delta = timedelta(days=1)
    HeaderDates = []
    while start_dt <= end_dt:
        HeaderDates.append(start_dt)
        start_dt += delta
    return HeaderDates

def creatDataMatrix(HeaderDates, calendarData):
    '''
    Creates the data matrix from the calendar data
    :param HeaderDates: list of all possible dates in the dataset, used as the column headers
    :param calendarData: the main information from the SQL query
    :return: data matrix covering all dates in the dataset, -1 marks dates missing from a scrape
    '''
    data = []
    for row in calendarData:
        tempList = [-1] * len(HeaderDates)
        tempJson = json.loads(row[0])
        for key in tempJson:
            date = datetime.strptime(key, '%Y-%m-%d').date()
            content = tempJson[key]
            index = [i for i, x in enumerate(HeaderDates) if x == date]
            tempList[index[0]] = content
        data.append(tempList)
    return data

def getAccuracy(df, baseLine, compLine):
    '''
    Calculates the accuracy between two scrape rows of a given dataframe
    :param df:
    :param baseLine: index of the baseline scrape row
    :param compLine: index of the comparison scrape row
    :return: Accuracy: the percentage of dates that hold the same information in both baseLine and compLine
    '''
    try:
        df = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    first = True
    for series_name, series in df.items():
        # skip the first column (ScrapeDate), count every remaining date column
        if first:
            first = False
        else:
            total += 1
            if series[baseLine] != -1:
                if series[compLine] != -1:
                    if series[baseLine] == series[compLine]:
                        noChange += 1
    accuracy = noChange / total
    return accuracy

def getMeanAccuracy(accList):
    '''
    Get the mean accuracy over the entire time delay of one property
    :param accList: list of accuracy values of a comparison
    :return: average of the accuracy values while ignoring the '-1' values
    '''
    out = []
    for row in accList:
        row = [x for x in row if x != -1]
        out.append(np.average(row))
    return out
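
For reference, a minimal sketch (not part of the module itself) of how getAccuracy and getMeanAccuracy behave, assuming this file is the Data_Analysis module imported by the scripts below; the frame, its column names and its values are invented for illustration and mimic the per-property CSVs exported further down:

import pandas as pd
import Data_Analysis as DA

# toy frame: first column is the scrape date, the remaining columns hold the
# calendar value per date, -1 marks dates missing from that scrape
toy = pd.DataFrame({
    'ScrapeDate': ['2024-01-01', '2024-01-04', '2024-01-07'],
    '2024-02-01': [0, 0, 1],
    '2024-02-02': [1, 1, -1],
    '2024-02-03': [2, 0, 2],
    '2024-02-04': [-1, 1, 1],
})

# rows 0 vs 1: two of the four date columns are comparable and unchanged,
# so the accuracy is 2/4 = 0.5 (total counts every date column, including
# pairs where one side is -1)
print(DA.getAccuracy(toy, 0, 1))

# compare every row with the next one; the last pair runs out of rows and
# returns -1, which getMeanAccuracy filters out before averaging
fullList = [[DA.getAccuracy(toy, i, i + 1) for i in range(toy.shape[0])]]
print(DA.getMeanAccuracy(fullList))  # mean of [0.5, 0.25] = 0.375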

View File

@@ -1,83 +0,0 @@
from datetime import datetime, timedelta
import json
import MySQLdb  # Version 2.2.4
import pandas as pd  # Version 2.2.3
import plotly.express as px  # Version 5.24.1

db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
cur = db.cursor()
cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
            "FROM extractions "
            "WHERE type='calendar' AND property_id = 200;")
dateoutput = cur.fetchall()
cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
            "FROM extractions "
            "WHERE type='calendar' AND property_id = 200;")
output = cur.fetchall()
db.close()

# create the list of scrape dates
ytickVals = list(range(0, 30, 5))
scrapeDates = []
for row in dateoutput:
    date = datetime.strptime(json.loads(row[0])[0], '%a, %d %b %Y %H:%M:%S %Z').date()
    dateStr = date.strftime('%d/%m/%Y')
    scrapeDates.append(dateStr)
# determine the minimal and maximal date
fullDateList = []
for row in output:
    tempJson = json.loads(row[0]).keys()
    for key in tempJson:
        fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())
end_dt = max(fullDateList)
start_dt = min(fullDateList)
delta = timedelta(days=1)
HeaderDates = []
while start_dt <= end_dt:
    HeaderDates.append(start_dt)
    start_dt += delta

# create the data matrix (-1 marks dates missing from a scrape)
data = []
for row in output:
    tempList = [-1] * len(HeaderDates)
    tempJson = json.loads(row[0])
    for key in tempJson:
        date = datetime.strptime(key, '%Y-%m-%d').date()
        content = tempJson[key]
        index = [i for i, x in enumerate(HeaderDates) if x == date]
        tempList[index[0]] = content
    data.append(tempList)

# transform to a DataFrame for Plotly
df = pd.DataFrame(data, columns=HeaderDates)
# generate the Plotly diagram
colScale = [[0, 'rgb(0, 0, 0)'], [0.33, 'rgb(204, 16, 16)'], [0.66, 'rgb(10, 102, 15)'], [1, 'rgb(17, 184, 26)']]
fig = px.imshow(df, color_continuous_scale=colScale)
lines = list(range(0, 30, 1))
for i in lines:
    fig.add_hline(y=i + 0.5)
fig.update_layout(yaxis=dict(tickfont=dict(size=50)))
fig.update_layout(xaxis=dict(tickfont=dict(size=50)))
fig.update_layout(xaxis_title="Verfügbarkeitsdaten Mietobjekt", yaxis_title="Scrapingvorgang")
fig.update_xaxes(title_font_size=100, title_font_weight="bold")
fig.update_yaxes(title_font_size=100, title_font_weight="bold")
fig.update_layout(yaxis=dict(tickmode='array', tickvals=ytickVals, ticktext=scrapeDates))
fig.update_xaxes(title_standoff=80)
fig.update_yaxes(title_standoff=80)
fig.update_layout(xaxis={'side': 'top'})
fig.show()

View File

@@ -1,58 +0,0 @@
import Data_Analysis as DA
import pandas as pd
accuracy = pd.read_csv('results/accMeanDf.csv')
propData = DA.getPropertyDataFromDB()
propData = pd.DataFrame(propData, columns=['property_id', 'region', 'geoLocation'])
propData = propData.drop(columns=['geoLocation'])
merge = pd.merge(propData, accuracy, on="property_id")
# region codes: 1 = Heidiland, 2 = Davos, 3 = Engadin, 4 = St. Moritz
heidiAcc = merge[merge['region'] == 1]
davosAcc = merge[merge['region'] == 2]
EngadAcc = merge[merge['region'] == 3]
StMorAcc = merge[merge['region'] == 4]
heidiMean = heidiAcc.mean(axis=0)
davosMean = davosAcc.mean(axis=0)
EngadMean = EngadAcc.mean(axis=0)
StMorMean = StMorAcc.mean(axis=0)
heidiSDev = heidiAcc.std(axis=0)
davosSDev = davosAcc.std(axis=0)
EngadSDev = EngadAcc.std(axis=0)
StMorSDev = StMorAcc.std(axis=0)
accuracyOverview = pd.DataFrame()
accuracyOverview.insert(0, "St. Moritz StdDev", StMorSDev, True)
accuracyOverview.insert(0, "St. Moritz Mean", StMorMean, True)
accuracyOverview.insert(0, "Engadin StdDev", EngadSDev, True)
accuracyOverview.insert(0, "Engadin Mean", EngadMean, True)
accuracyOverview.insert(0, "Davos StdDev", davosSDev, True)
accuracyOverview.insert(0, "Davos Mean", davosMean, True)
accuracyOverview.insert(0, "Heidi StdDev", heidiSDev, True)
accuracyOverview.insert(0, "Heidi Mean", heidiMean, True)
# drop the first two rows (property_id and region), they are not accuracy values
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
accuracyOverview.drop(index=accuracyOverview.index[0], axis=0, inplace=True)
accuracyOverview.to_csv('results/accuracyOverview.csv', index=True)
# delete unused DataFrames
del merge, accuracy, propData
del heidiAcc, davosAcc, EngadAcc, StMorAcc
del heidiMean, davosMean, EngadMean, StMorMean
del heidiSDev, davosSDev, EngadSDev, StMorSDev
print(accuracyOverview)
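
As a side note, the per-region means and standard deviations could also be produced with a single groupby; a sketch that would have to run before the del statements above release merge (column names as defined in this script):

# aggregate mean and standard deviation per region in one step
summary = (merge
           .drop(columns=['property_id'])
           .groupby('region')
           .agg(['mean', 'std']))
print(summary)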

View File

@@ -1,73 +0,0 @@
import pandas as pd
import os
import re
import numpy as np
def getAccuracy(df, baseLine, compLine):
    try:
        df = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    first = True
    for series_name, series in df.items():
        if first:
            first = False
        else:
            total += 1
            if series[baseLine] != -1:
                if series[compLine] != -1:
                    if series[baseLine] == series[compLine]:
                        noChange += 1
    accuracy = noChange / total
    return accuracy


def getMeanAccuracy(accList):
    out = []
    for row in accList:
        row = [x for x in row if x != -1]
        out.append(np.average(row))
    return out

# time delays between compared scrapes:
# 1  = 1 scrape interval
# 2  = ca. 1 week
# 10 = 1 month (30 days)
# 20 = 2 months
deltaList = [1, 2, 10, 20]
directory = os.fsencode("dok")
columnNames = ['property_id', 'timedelay_1', 'timedelay_2', 'timedelay_10', 'timedelay_20']
accListRows = []
accMeanRows = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".csv"):
        propId = re.findall(r"\d+", filename)[0]
        print(propId)
        df = pd.read_csv(f'dok/{filename}')
        fullList = []
        # loop through all deltas in the deltaList
        for delta in deltaList:
            accList = []
            # loop through all dates as baseline date
            for i in range(df.shape[0]):
                acc = getAccuracy(df, i, i + delta)
                accList.append(acc)
            fullList.append(accList)
        meanList = getMeanAccuracy(fullList)
        # collect one row per property; building the DataFrames once at the end
        # replaces the repeated calls to the private DataFrame._append
        accListRows.append({'property_id': propId, 'timedelay_1': fullList[0], 'timedelay_2': fullList[1], 'timedelay_10': fullList[2], 'timedelay_20': fullList[3]})
        accMeanRows.append({'property_id': propId, 'timedelay_1': meanList[0], 'timedelay_2': meanList[1], 'timedelay_10': meanList[2], 'timedelay_20': meanList[3]})
accListDf = pd.DataFrame(accListRows, columns=columnNames)
accMeanDf = pd.DataFrame(accMeanRows, columns=columnNames)
accListDf.to_csv('results/accListDf.csv', index=False)
accMeanDf.to_csv('results/accMeanDf.csv', index=False)
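
As a quick sanity check of that delta-to-time mapping, the ScrapeDate column of one of the exported per-property CSVs could be inspected; a sketch assuming dok/calendarData_prop200.csv was produced by the export script further below (property 200 is only an example id):

import pandas as pd

dates = pd.to_datetime(pd.read_csv('dok/calendarData_prop200.csv')['ScrapeDate'])
print(dates.diff().dropna().median())               # typical gap between consecutive scrapes
print((dates.shift(-2) - dates).dropna().median())  # calendar time spanned by a row offset of 2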

View File

@@ -1,20 +0,0 @@
import Data_Analysis as DA
import csv
propIds = DA.getuniquePropIdFromDB()
lostProperties = []
for propId in propIds:
    print(propId)
    scrapeDates, calendarData = DA.getDataFromDB(propId)
    if DA.checkForLostProprty(calendarData):
        lostProperties.append(propId)
print(f"{len(lostProperties)} of {len(propIds)} properties are lost")
# newline='' as recommended for csv.writer
with open('results/allLostProperties', 'w', newline='') as f:
    write = csv.writer(f)
    write.writerow(lostProperties)
# Output: 221 of 1552 properties were lost at some point

View File

@@ -1,28 +0,0 @@
import Data_Analysis as DA
import pandas as pd
import os
propIds = DA.getuniquePropIdFromDB()
for propId in propIds:
    name = f"dok/calendarData_prop{propId}.csv"
    if not os.path.exists(name):
        print(propId)
        scrapeDates, calendarData = DA.getDataFromDB(propId)
        if DA.checkForLostProprty(calendarData):
            print(f"Lost Property: {propId}")
        else:
            scrapeDates = DA.reformatScrapeDates(scrapeDates)
            HeaderDates = DA.getMinMaxDate(calendarData)
            data = DA.creatDataMatrix(HeaderDates, calendarData)
            # transform to a DataFrame for Plotly
            df = pd.DataFrame(data, columns=HeaderDates)
            df.insert(0, "ScrapeDate", scrapeDates, True)
            df = df.drop(index=0)  # irregular gap in the scraping times (only 2 days)
            df = df.drop(df.columns[[1, 2]], axis=1)
            df.to_csv(name, index=False)

View File

@@ -1,32 +0,0 @@
import Data_Analysis as DA
import pandas as pd
# read all scrape dates, reformat them and remove duplicates
uniqueScrapeDates = DA.getUniqueScrapeDates()
uniqueScrapeDates = DA.reformatScrapeDates(uniqueScrapeDates)
uniqueScrapeDates = list(dict.fromkeys(uniqueScrapeDates))

# build the list of property lists per scrape date
fullPropList = []
for date in uniqueScrapeDates:
    propList = []
    properties = DA.getPropsPerScrape(date)
    for prop in properties:
        propList.append(prop[0])
    propList = list(dict.fromkeys(propList))
    fullPropList.append(propList)
print(fullPropList)

# convert to a DataFrame with the property ids as column names (one-hot encoding)
all_property_ids = sorted(set([item for sublist in fullPropList for item in sublist]))
print(all_property_ids)
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=all_property_ids)
for i, property_list in enumerate(fullPropList):
    df.loc[i, property_list] = 1
df.to_csv('results/PropertiesPerScrape.csv', index=True)
print(df)
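
For illustration, the one-hot step in isolation with made-up property ids (the real script derives both the scrape list and the ids from the database):

import pandas as pd

# two hypothetical scrapes covering three distinct property ids
fullPropList = [[101, 102], [102, 103]]
all_property_ids = sorted(set(pid for sublist in fullPropList for pid in sublist))
df = pd.DataFrame(0, index=range(len(fullPropList)), columns=all_property_ids)
for i, property_list in enumerate(fullPropList):
    df.loc[i, property_list] = 1
print(df)
# roughly:
#    101  102  103
# 0    1    1    0
# 1    0    1    1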

View File

@@ -1,121 +0,0 @@
from etl.src import data
import json
import polars as pl
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
'''
# Get data from the DB and cache it as CSV (run once)
inst = data.load()
df = inst.extractions_with_region().pl()
print(df)

data = []
for row in df.iter_rows():
    property_id = row[1]
    created_at = row[2].date()
    entry = {'property_id': property_id, 'created_at': created_at, 'name': row[3]}
    jsonStr = row[0]
    if jsonStr:
        calendarDict = json.loads(jsonStr)
        for key in calendarDict:
            entry[key] = calendarDict[key]
    data.append(entry)

dfNew = pl.from_dicts(data)
dfNew.write_csv('results/data_quality.csv')
print(dfNew)
'''
dfNew = pl.read_csv('results/data_quality.csv')
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))

# create row sums
dfTemp = dfNew
# temporarily remove the leading columns but keep them for later
prop = dfTemp.get_column('property_id')
dfTemp = dfTemp.drop('property_id')
crea = dfTemp.get_column('created_at')
dfTemp = dfTemp.drop('created_at')
name = dfTemp.get_column('name')
dfTemp = dfTemp.drop('name')
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
sumCol = dfTemp.get_column('sum')

# create a new DF with only property_id, created_at, location name and sum
df = pl.DataFrame([prop, crea, name, sumCol])
df = df.sort('created_at')
# region filter:
# 0 = all regions
# 1 = Heidiland
# 2 = Davos
# 3 = Engadin
# 4 = St. Moritz
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']
regionFilter = 4
if regionFilter != 0:
    df = df.filter(pl.col("name") == regionFilter)

# remove the location name column
df = df.drop('name')

# get the unique property_ids
propsIDs = df.unique(subset=["property_id"])
propsIDs = propsIDs.get_column("property_id").to_list()
propsIDs.sort()
# create the matrix: one dict of {scrape date: sum} per property
matrix = []
for propId in propsIDs:
    dayDict = {}
    temp = df.filter(pl.col("property_id") == propId)
    for row in temp.iter_rows():
        dayDict[row[1].strftime('%Y-%m-%d')] = row[2]
    matrix.append(dayDict)
matrix = pl.DataFrame(matrix)
dates = matrix.columns
matrix = matrix.to_numpy()

# normalize
matrix = matrix / 1111

yRange = range(len(dates))
xRange = range(len(propsIDs))
matrix = matrix.T
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[regionFilter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
# create the difference matrix between consecutive scrape-date rows
diffMatrix = np.zeros((len(matrix) - 1, len(matrix[0])))
for y in range(len(matrix[0])):
    for x in range(len(matrix) - 1):
        diffMatrix[x][y] = abs(matrix[x][y] - matrix[x + 1][y])

plt.figure()
plt.imshow(diffMatrix, cmap="Reds")
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[regionFilter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
plt.show()
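
The nested loops above compute element-wise absolute differences between consecutive scrape-date rows; a NumPy sketch that produces the same diffMatrix in one vectorised call:

# equivalent to the double loop: |matrix[x] - matrix[x + 1]| for consecutive rows
diffMatrix = np.abs(np.diff(matrix, axis=0))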

File diff suppressed because one or more lines are too long (this notice appears once for each of 92 suppressed files)

Some files were not shown because too many files have changed in this diff