ETL update

Untersuchungen:
- Welche Properties gibt es, die nicht mehr gefunden werden?
- Wie genau sind die Daten 3 Tage, 1 Woche, 1 Monat und 2 Monate im Voraus?

Alle relevanten Resultate liegen im results-Ordner.
refactor-to-mono
mmaurostoffel 2024-10-19 20:25:30 +02:00
parent fde2f7ffb7
commit c7d58c2b23
1340 changed files with 52807 additions and 11 deletions

View File

@ -1,14 +1,157 @@
import MySQLdb import MySQLdb
import json
from datetime import datetime, timedelta
import numpy as np
def getDataFromDB(propId):
    '''
    Fetch the scrape dates and the calendar payloads stored for one property.

    Runs two queries against the ``extractions`` table, both restricted to
    rows with ``type='calendar'`` and the given ``property_id``.

    :param propId: property id to filter on; passed to the driver as a bound
        query parameter
    :return: tuple ``(scrapeDates, calendarData)`` with the ``fetchall()``
        result of the header-date query and of the calendar-days query
    '''
    # NOTE(review): connection settings are hard-coded here; consider moving
    # them to configuration.
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()

        # The %s placeholder lets the driver escape propId instead of pasting
        # it into the SQL text.
        # NOTE(review): neither query has an ORDER BY, yet the two results are
        # paired row by row later on - confirm the row order is stable.
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;", (propId,))
        scrapeDates = cur.fetchall()

        cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;", (propId,))
        calendarData = cur.fetchall()
    finally:
        # Release the connection even when one of the queries fails.
        db.close()

    return scrapeDates, calendarData
def getuniquePropIdFromDB():
    '''
    Collect every distinct ``property_id`` found in the ``extractions`` table.

    :return: propList, a list holding the first column of each result row
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT DISTINCT property_id "
                    "FROM extractions;")
        propIds = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        db.close()

    # Each result row is a one-element sequence; unwrap it.
    return [row[0] for row in propIds]
def reformatScrapeDates(scrapeDatesIn):
    '''
    Convert the raw scrape-date rows into short ``YYYY-MM-DD`` strings.

    :param scrapeDatesIn: rows whose first field is a JSON array; the first
        element of that array is a timestamp in the form
        ``Sat, 19 Oct 2024 20:25:30 GMT``
    :return: list of date strings, one per input row, in input order
    '''
    header_format = '%a, %d %b %Y %H:%M:%S %Z'

    def shorten(row):
        # The timestamp is the first element of the JSON array in column 0.
        raw_stamp = json.loads(row[0])[0]
        parsed_day = datetime.strptime(raw_stamp, header_format).date()
        return parsed_day.strftime('%Y-%m-%d')

    return [shorten(row) for row in scrapeDatesIn]
def checkForLostProprty(calendarData):
    '''
    Report whether any row of the calendar data contains a ``None`` entry.

    A ``None`` entry is taken to mean the property was no longer found at
    that scrape.

    :param calendarData: iterable of result rows
    :return: True as soon as one row contains ``None``, otherwise False
    '''
    return any(None in record for record in calendarData)
def getMinMaxDate(calendarData):
    '''
    List every calendar day between the earliest and the latest date found.

    :param calendarData: rows whose first field is a JSON object keyed by
        ``YYYY-MM-DD`` date strings
    :return: list of consecutive ``date`` objects, from the smallest to the
        largest key over all rows, both ends included
    '''
    # Gather the date keys of every row into one flat list.
    seen_days = [
        datetime.strptime(day_key, '%Y-%m-%d').date()
        for record in calendarData
        for day_key in json.loads(record[0]).keys()
    ]

    last_day = max(seen_days)
    first_day = min(seen_days)

    # One entry per day, so gaps between scraped dates are filled as well.
    day_count = (last_day - first_day).days + 1
    return [first_day + timedelta(days=offset) for offset in range(day_count)]
def creatDataMatrix(HeaderDates, calendarData):
    '''
    Build one matrix row per scrape, aligned to the given header dates.

    :param HeaderDates: list of dates used as the column headers
    :param calendarData: rows whose first field is a JSON object mapping
        ``YYYY-MM-DD`` keys to the value stored for that day
    :return: list of lists; every inner list has ``len(HeaderDates)`` entries
        and holds ``-1`` wherever the scrape had no entry for that date
    :raises IndexError: if a scrape contains a date that is not in HeaderDates
    '''
    # Resolve each date to its first column once, instead of rescanning
    # HeaderDates for every key of every row.
    column_of = {}
    for position, header_date in enumerate(HeaderDates):
        column_of.setdefault(header_date, position)

    data = []
    for row in calendarData:
        cells = [-1] * len(HeaderDates)
        for key, content in json.loads(row[0]).items():
            day = datetime.strptime(key, '%Y-%m-%d').date()
            try:
                cells[column_of[day]] = content
            except KeyError:
                # Keep the exception type an unknown date has always produced.
                raise IndexError(f"date {day} is not part of HeaderDates") from None
        data.append(cells)
    return data
def getAccuracy(df, baseLine, compLine):
    '''
    Compute the share of value columns that are identical in two rows.

    The first column of ``df`` is skipped; every remaining column counts
    towards the total. A column counts as unchanged only when neither row
    holds the placeholder ``-1`` and both rows hold the same value.

    :param df: dataframe with one row per scrape; column 0 is not compared
    :param baseLine: position of the baseline row
    :param compLine: position of the row compared against the baseline
    :return: unchanged columns divided by compared columns, or ``-1`` when a
        position is out of range or there is no column to compare
    '''
    try:
        pair = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1

    value_columns = pair.iloc[:, 1:]
    total = value_columns.shape[1]
    if total == 0:
        # Nothing to compare; use the sentinel instead of dividing by zero.
        return -1

    noChange = 0
    for _, series in value_columns.items():
        # Positional access: row 0 is the baseline and row 1 the comparison,
        # whatever index labels the frame carries.
        base_value = series.iloc[0]
        comp_value = series.iloc[1]
        if base_value != -1 and comp_value != -1 and base_value == comp_value:
            noChange += 1

    return noChange / total
def getMeanAccuracy(accList):
    '''
    Average each list of accuracy values, leaving out the ``-1`` entries.

    :param accList: list of lists of accuracy values, one inner list per
        comparison
    :return: list with one ``np.average`` result per inner list, computed
        over the values that are not ``-1``
    '''
    return [
        np.average([value for value in values if value != -1])
        for values in accList
    ]

Binary file not shown.

47
createAccuracyValues.py Normal file
View File

@ -0,0 +1,47 @@
import Data_Analysis as DA
import pandas as pd
import os
import re
# Row offsets between the baseline scrape and the scrape it is compared with:
#   1  = 1 scrape interval
#   2  = approx. 1 week
#   10 = 1 month (30 days)
#   20 = 2 months
deltaList = [1, 2, 10, 20]

directory = os.fsencode("dok")

columnNames = ['property_id', 'timedelay_1', 'timedelay_2', 'timedelay_10', 'timedelay_20']

# Rows are collected in plain lists and turned into dataframes once at the
# end; this avoids the private DataFrame._append and the per-row copy of the
# whole frame.
accListRows = []
accMeanRows = []

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".csv"):
        continue

    # The first run of digits in the file name is used as the property id.
    propId = re.findall(r"\d+", filename)[0]
    print(propId)
    df = pd.read_csv(f'dok/{filename}')

    # One accuracy list per delta; every row serves as the baseline once.
    fullList = [
        [DA.getAccuracy(df, i, i + delta) for i in range(df.shape[0])]
        for delta in deltaList
    ]
    meanList = DA.getMeanAccuracy(fullList)

    accListRows.append({'property_id': propId, **dict(zip(columnNames[1:], fullList))})
    accMeanRows.append({'property_id': propId, **dict(zip(columnNames[1:], meanList))})

accListDf = pd.DataFrame(accListRows, columns=columnNames)
accMeanDf = pd.DataFrame(accMeanRows, columns=columnNames)

accListDf.to_csv('results/accListDf.csv', index=False)
accMeanDf.to_csv('results/accMeanDf.csv', index=False)

20
createLostPropertyList.py Normal file
View File

@ -0,0 +1,20 @@
import Data_Analysis as DA
import csv
# Find every property that has at least one calendar scrape without data and
# write the ids of those properties as a single CSV row.
propIds = DA.getuniquePropIdFromDB()

lostProperties = []
for propId in propIds:
    print(propId)
    # Only the calendar data is needed here; the scrape dates are ignored.
    _, calendarData = DA.getDataFromDB(propId)
    if DA.checkForLostProprty(calendarData):
        lostProperties.append(propId)

print(f"{len(lostProperties)} of {len(propIds)} properties are lost")

# newline='' is what the csv module expects for files it writes to; without
# it, platforms that translate line endings get an extra carriage return.
with open('results/allLostProperties', 'w', newline='') as f:
    write = csv.writer(f)
    write.writerow(lostProperties)

# Output: 221 of 1552 properties are lost

28
createPropCSV.py Normal file
View File

@ -0,0 +1,28 @@
import Data_Analysis as DA
import pandas as pd
import os
# Export one CSV per property with a row per scrape and a column per calendar
# date. Properties whose CSV already exists are skipped.
propIds = DA.getuniquePropIdFromDB()

for propId in propIds:
    name = f"dok/calendarData_prop{propId}.csv"
    if os.path.exists(name):
        continue

    print(propId)
    scrapeDates, calendarData = DA.getDataFromDB(propId)

    if DA.checkForLostProprty(calendarData):
        print(f"Lost Proprty: {propId}")
        continue

    scrapeDates = DA.reformatScrapeDates(scrapeDates)
    HeaderDates = DA.getMinMaxDate(calendarData)
    data = DA.creatDataMatrix(HeaderDates, calendarData)

    # Transform to a dataframe for Plotly
    df = pd.DataFrame(data, columns=HeaderDates)
    df.insert(0, "ScrapeDate", scrapeDates, True)

    # Drop the first scrape: irregular gap in the scraping times (only 2 days)
    df = df.drop(index=0)
    # Drop the two columns that follow "ScrapeDate"
    df = df.drop(df.columns[[1, 2]], axis=1)

    df.to_csv(name, index=False)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More