ETL update
Untersuchungen: - Welche Properties gibt es, die nicht mehr gefunden werden? - Wie genau sind die Daten 3 Tage, 1 Woche, 1 Monat und 2 Monate im Voraus? Alle relevanten Resultate liegen im results-Ordner. refactor-to-mono
parent
fde2f7ffb7
commit
c7d58c2b23
163
Data_Analysis.py
163
Data_Analysis.py
|
@ -1,14 +1,157 @@
|
|||
import MySQLdb
|
||||
|
||||
# Ad-hoc query: read the "Date" entry of every stored extraction header and
# print the raw result rows.
db = MySQLdb.connect(host="localhost",user="root",passwd="admin",db="consultancy")
try:
    cur = db.cursor()
    cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                "FROM extractions ")
    dateoutput = cur.fetchall()
finally:
    # Release the connection even when the query raises.
    db.close()

print(dateoutput)
|
||||
import json
|
||||
from datetime import datetime, timedelta
|
||||
import numpy as np
|
||||
|
||||
|
||||
def getDataFromDB(propId):
    '''
    Fetch the calendar extractions of one property from the MySQL database.

    Both JSON fields are read with a single parameterised query, so the n-th
    scrape date always belongs to the n-th calendar entry and propId is never
    spliced into the SQL text.

    :param propId: value compared against extractions.property_id
    :return: scrapeDates and calendarData, two equally long tuples of
             1-tuples holding JSON_EXTRACT(header, '$.Date') and
             JSON_EXTRACT(body, '$.content.days') of each matching row
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date'), "
                    "JSON_EXTRACT(body, '$.content.days') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;",
                    (propId,))
        rows = cur.fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()

    # Keep the one-column-per-row shape that callers index with row[0].
    scrapeDates = tuple((row[0],) for row in rows)
    calendarData = tuple((row[1],) for row in rows)

    return scrapeDates, calendarData
|
||||
|
||||
def getuniquePropIdFromDB():
    '''
    Read every distinct property_id stored in the extractions table.

    :return: propList, a list with one property_id per entry
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT DISTINCT property_id "
                    "FROM extractions;")
        propIds = cur.fetchall()
    finally:
        # Release the connection even when the query raises.
        db.close()

    # Each result row is a 1-tuple; unwrap it to the bare id.
    return [row[0] for row in propIds]
|
||||
|
||||
def reformatScrapeDates(scrapeDatesIn):
    '''
    Reformat the scrape-date column to short 'YYYY-MM-DD' strings.

    :param scrapeDatesIn: iterable of rows whose first element is a JSON
                          array text; the array's first element is a date
                          such as "Mon, 01 Jan 2024 10:30:00 GMT"
    :return: list with one 'YYYY-MM-DD' string per input row
    '''
    scrapeDates = []
    for row in scrapeDatesIn:
        headerDate = json.loads(row[0])[0]
        parsed = datetime.strptime(headerDate, '%a, %d %b %Y %H:%M:%S %Z')
        # Only the calendar day is kept; the time of day is dropped.
        scrapeDates.append(parsed.date().strftime('%Y-%m-%d'))

    return scrapeDates
|
||||
|
||||
def checkForLostProprty(calendarData):
    '''
    Check whether the calendarData contains "None" entries, meaning the
    property was no longer found.

    :param calendarData: iterable of result rows (tuples)
    :return: Boolean, True as soon as any row contains None, otherwise False
    '''
    return any(None in row for row in calendarData)
|
||||
|
||||
|
||||
def getMinMaxDate(calendarData):
    '''
    Build the continuous list of days spanned by the calendar data.

    :param calendarData: all calendar rows from the query; the first element
                         of each row is a JSON object text keyed by
                         'YYYY-MM-DD' dates, or None
    :return: list of every date from the smallest to the largest key
             (inclusive, one entry per day); empty list when no date exists
    '''
    # Determine the minimal and maximal date.
    fullDateList = []
    for row in calendarData:
        if row[0] is None:
            # No calendar payload in this row, so it contributes no dates.
            continue
        for key in json.loads(row[0]):
            fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())

    if not fullDateList:
        return []

    start_dt = min(fullDateList)
    end_dt = max(fullDateList)
    dayCount = (end_dt - start_dt).days + 1

    return [start_dt + timedelta(days=offset) for offset in range(dayCount)]
|
||||
|
||||
|
||||
def creatDataMatrix(HeaderDates, calendarData):
    '''
    Create the data matrix from the calendar data.

    :param HeaderDates: list of all possible dates in the dataset, used as
                        the column headers
    :param calendarData: the main information from the SQL query; the first
                         element of each row is a JSON object text keyed by
                         'YYYY-MM-DD' dates, or None
    :return: one list per calendar row, as long as HeaderDates, holding the
             JSON value of each date and -1 where the row has no entry
    :raises IndexError: when a row contains a date missing from HeaderDates
    '''
    # Map each header date to its column once instead of scanning the header
    # list for every key; setdefault keeps the first occurrence of a date.
    columnOf = {}
    for position, headerDate in enumerate(HeaderDates):
        columnOf.setdefault(headerDate, position)

    data = []
    for row in calendarData:
        tempList = [-1] * len(HeaderDates)
        # A None payload carries no calendar entries; its row stays all -1.
        if row[0] is not None:
            for key, content in json.loads(row[0]).items():
                date = datetime.strptime(key, '%Y-%m-%d').date()
                position = columnOf.get(date)
                if position is None:
                    raise IndexError(f"date {key} is not part of HeaderDates")
                tempList[position] = content
        data.append(tempList)

    return data
|
||||
|
||||
|
||||
def getAccuracy(df, baseLine, compLine):
    '''
    Calculate how well two rows of a dataframe agree.

    Rows are addressed by position throughout, so the result does not depend
    on the index labels of df. The first column is skipped; every remaining
    column counts towards the total.

    :param df: dataframe whose first column is not compared
    :param baseLine: position of the baseline row
    :param compLine: position of the row compared against the baseline
    :return: Accuracy: the share of compared columns in which both rows hold
             the same value other than -1; -1 when one of the rows does not
             exist or there is no column to compare
    '''
    try:
        baseRow = df.iloc[baseLine, 1:]
        compRow = df.iloc[compLine, 1:]
    except IndexError:
        return -1

    total = len(baseRow)
    if total == 0:
        # Nothing to compare; report "not computable" instead of dividing by 0.
        return -1

    noChange = sum(
        1
        for baseValue, compValue in zip(baseRow, compRow)
        if baseValue != -1 and compValue != -1 and baseValue == compValue
    )

    return noChange / total
|
||||
|
||||
def getMeanAccuracy(accList):
    '''
    Average each list of accuracy values while ignoring the '-1' values.

    :param accList: list of lists with the accuracy values of a comparison
    :return: list with one average per input list; NaN for a list that holds
             no value other than -1
    '''
    out = []
    for row in accList:
        valid = [x for x in row if x != -1]
        # Averaging an empty list would only yield NaN plus a RuntimeWarning.
        out.append(np.average(valid) if valid else np.nan)
    return out
|
||||
|
|
Binary file not shown.
|
@ -0,0 +1,47 @@
|
|||
import Data_Analysis as DA
|
||||
import pandas as pd
|
||||
import os
|
||||
import re
|
||||
|
||||
|
||||
|
||||
|
||||
# Row offsets between the baseline scrape and the scrape it is compared with:
# 1 = 1 scrape interval
# 2 = approx. 1 week
# 10 = 1 month (30 days)
# 20 = 2 months
deltaList = [1, 2, 10, 20]

directory = os.fsencode("dok")

columnNames = ['property_id', 'timedelay_1', 'timedelay_2','timedelay_10','timedelay_20']

# One dict per property; the frames are built once after the loop instead of
# growing them row by row through the private DataFrame._append.
accListRows = []
accMeanRows = []

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if not filename.endswith(".csv"):
        continue

    # The first run of digits in the file name is used as the property id.
    propId = re.findall(r"\d+", filename)[0]
    print(propId)
    df = pd.read_csv(f'dok/{filename}')

    # fullList[k] holds, for deltaList[k], one accuracy per baseline row.
    fullList = []
    for delta in deltaList:
        # Use every row as the baseline once.
        accList = [DA.getAccuracy(df, i, i + delta) for i in range(df.shape[0])]
        fullList.append(accList)

    meanList = DA.getMeanAccuracy(fullList)
    accListRows.append(dict(zip(columnNames, [propId] + fullList)))
    accMeanRows.append(dict(zip(columnNames, [propId] + meanList)))

accListDf = pd.DataFrame(accListRows, columns=columnNames)
accMeanDf = pd.DataFrame(accMeanRows, columns=columnNames)

os.makedirs('results', exist_ok=True)
accListDf.to_csv('results/accListDf.csv', index=False)
accMeanDf.to_csv('results/accMeanDf.csv', index=False)
|
|
@ -0,0 +1,20 @@
|
|||
import Data_Analysis as DA
|
||||
import csv
|
||||
|
||||
# Collect every property whose calendar data contains a None entry.
propIds = DA.getuniquePropIdFromDB()

lostProperties = []

for propId in propIds:
    print(propId)
    # Only the calendar rows are needed here; the scrape dates are discarded.
    _, calendarData = DA.getDataFromDB(propId)
    if DA.checkForLostProprty(calendarData):
        lostProperties.append(propId)

print(f"{len(lostProperties)} of {len(propIds)} properties are lost")

# newline='' lets the csv module control the line endings itself.
with open('results/allLostProperties', 'w', newline='') as f:
    write = csv.writer(f)
    write.writerow(lostProperties)

#Output: 221 of 1552 properties are lost
|
|
@ -0,0 +1,28 @@
|
|||
import Data_Analysis as DA
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
|
||||
|
||||
# Export one CSV per property: a row per scrape, a column per calendar day.
propIds = DA.getuniquePropIdFromDB()

for propId in propIds:
    name = f"dok/calendarData_prop{propId}.csv"
    if os.path.exists(name):
        # Already exported earlier.
        continue

    print(propId)
    scrapeDates, calendarData = DA.getDataFromDB(propId)
    if DA.checkForLostProprty(calendarData):
        print(f"Lost Proprty: {propId}")
        continue

    scrapeDates = DA.reformatScrapeDates(scrapeDates)
    HeaderDates = DA.getMinMaxDate(calendarData)
    matrix = DA.creatDataMatrix(HeaderDates, calendarData)

    # Transform to a dataframe for Plotly.
    frame = pd.DataFrame(matrix, columns=HeaderDates)
    frame.insert(0, "ScrapeDate", scrapeDates, True)

    # The first scrape has an irregular gap in the scraping times (only 2 days).
    frame = frame.drop(index=0)
    # Drop the two columns that directly follow "ScrapeDate".
    # NOTE(review): the reason for removing them is not stated here - confirm.
    unwantedColumns = frame.columns[[1, 2]]
    frame = frame.drop(unwantedColumns, axis=1)
    frame.to_csv(name, index=False)
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue