# ConsultancyProject_2_ETL/Data_Analysis.py
# (source listing metadata: 200 lines, 5.7 KiB, Python)
import MySQLdb
import json
from datetime import datetime, timedelta
import numpy as np
def getPropertyDataFromDB():
    '''
    Fetches id, seed_id and check_data for every row of the `properties` table.
    :return: tuple of (id, seed_id, check_data) rows as returned by fetchall()
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT id, seed_id, check_data "
                    "FROM properties ")
        propData = cur.fetchall()
    finally:
        # close even if execute/fetch raises, so the connection never leaks
        db.close()
    return propData
def getDataFromDB(propId):
    '''
    Function to get data from MySQL database filter with the given propId
    :param propId: property_id value used to filter the `extractions` table
    :return: scrapeDates and calendarData
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        # parameterized queries instead of f-string interpolation: avoids SQL
        # injection and lets the driver handle quoting
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;", (propId,))
        scrapeDates = cur.fetchall()
        cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;", (propId,))
        calendarData = cur.fetchall()
    finally:
        db.close()
    return scrapeDates, calendarData
def getUniqueScrapeDates():
    '''
    Fetches the scrape date header of every calendar extraction.
    NOTE(review): despite the name, the query has no DISTINCT, so duplicate
    dates are returned — confirm whether callers rely on one row per
    extraction before adding DISTINCT.
    :return: tuple of single-element rows containing the JSON '$.Date' header
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar'")
        uniqueScrapeDates = cur.fetchall()
    finally:
        # close even on error so the connection never leaks
        db.close()
    return uniqueScrapeDates
def getPropsPerScrape(scrapeDate):
    '''
    Fetches the property_ids of all calendar extractions created within the
    24 hours following the given scrape date.
    :param scrapeDate: day in '%Y-%m-%d' format (also validated by strptime)
    :return: tuple of single-element (property_id,) rows
    '''
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        # parameterized bounds instead of f-string interpolation (SQL injection)
        cur.execute("SELECT property_id "
                    "FROM extractions "
                    "WHERE type='calendar' AND created_at > %s AND created_at < %s",
                    (scrapeDate, str(end_date)))
        propIds = cur.fetchall()
    finally:
        db.close()
    return propIds
def getuniquePropIdFromDB():
    '''
    Function to get unique propId from MySQL database
    :return: propList — flat list of distinct property_id values
    '''
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT DISTINCT property_id "
                    "FROM extractions;")
        propIds = cur.fetchall()
    finally:
        # close even on error so the connection never leaks
        db.close()
    # fetchall() yields 1-tuples; unwrap to a flat list
    return [propId[0] for propId in propIds]
def reformatScrapeDates(scrapeDatesIn):
    '''
    Reformats the scrapeDates column to a shortened datetime format
    :param scrapeDatesIn: rows whose first element is a JSON array containing
        an RFC-1123 style date string (e.g. "Mon, 01 Jan 2024 10:00:00 GMT")
    :return: list of '%Y-%m-%d' strings, one per input row
    '''
    scrapeDates = []
    for row in scrapeDatesIn:
        # each row holds a JSON list; its first element is the HTTP Date header
        parsed = datetime.strptime(json.loads(row[0])[0], '%a, %d %b %Y %H:%M:%S %Z').date()
        # renamed local (was `str`) — shadowing the builtin broke str() below it
        scrapeDates.append(parsed.strftime('%Y-%m-%d'))
    return scrapeDates
def checkForLostProprty(calendarData):
    '''
    Checks if there are "None" Entries in the calendarData meaning they were no longer found
    :param calendarData: iterable of row tuples from the calendar query
    :return: Boolean indicating if there are "None" Entries in the calendarData
    '''
    return any(None in row for row in calendarData)
def getMinMaxDate(calendarData):
    '''
    Builds the full, contiguous list of daily dates spanning from the earliest
    to the latest date found anywhere in the calendar data (despite the name,
    the whole range is returned, not just min/max — kept for callers).
    :param calendarData: rows whose first element is a JSON object keyed by
        '%Y-%m-%d' date strings
    :return: list of datetime.date, one per day from min to max inclusive;
        [] when calendarData contains no dates
    '''
    fullDateList = []
    for row in calendarData:
        for key in json.loads(row[0]):
            fullDateList.append(datetime.strptime(key, '%Y-%m-%d').date())
    # guard: max()/min() raise ValueError on an empty sequence
    if not fullDateList:
        return []
    start_dt = min(fullDateList)
    end_dt = max(fullDateList)
    delta = timedelta(days=1)
    HeaderDates = []
    while start_dt <= end_dt:
        HeaderDates.append(start_dt)
        start_dt += delta
    return HeaderDates
def creatDataMatrix(HeaderDates, calendarData):
    '''
    Creates the data matrix from a calendar data
    :param HeaderDates: The list of all possible Dates in the dataset is used as the headers
    :param calendarData: the main information from the sql querry
    :return: data Matrix with all the dates in the dataset; cells with no
        information stay -1
    '''
    # precompute date -> column index once: the original rescanned HeaderDates
    # for every key (O(rows * days^2)); a dict lookup makes it O(rows * days)
    columnIndex = {d: i for i, d in enumerate(HeaderDates)}
    data = []
    for row in calendarData:
        tempList = [-1] * len(HeaderDates)
        tempJson = json.loads(row[0])
        for key, content in tempJson.items():
            day = datetime.strptime(key, '%Y-%m-%d').date()
            # a date missing from HeaderDates now raises KeyError (was IndexError)
            tempList[columnIndex[day]] = content
        data.append(tempList)
    return data
def getAccuracy(df, baseLine, compLine):
    '''
    Calculates the accuracy of a given dataframe with a given baseLine and compLine
    :param df: pandas DataFrame of calendar values (-1 means "no data")
    :param baseLine: positional row index of the baseline scrape
    :param compLine: positional row index of the comparison scrape
    :return: Accuracy: The percentage of dates that had the same information in
        both baseLine and compLine; -1 when a row index is out of range or no
        comparable columns exist
    '''
    try:
        df = df.iloc[[baseLine, compLine]]
    except IndexError:
        # requested row does not exist for this property
        return -1
    total = 0
    noChange = 0
    for colNum, (series_name, series) in enumerate(df.items()):
        # first column is metadata, not a date column — skip it
        if colNum == 0:
            continue
        total += 1
        baseVal = series[baseLine]
        compVal = series[compLine]
        # only count a match when both scrapes actually had data (!= -1)
        if baseVal != -1 and compVal != -1 and baseVal == compVal:
            noChange += 1
    # guard: a frame with <= 1 column used to raise ZeroDivisionError;
    # return the same -1 sentinel as the IndexError path
    if total == 0:
        return -1
    return noChange / total
def getMeanAccuracy(accList):
    '''
    Get the mean Accuracy of the entire timedelay of one property
    :param accList: List of accuracy Values of a comparison
    :return: Average of the accuracy values while ignoring the '-1' values;
        np.nan for a row whose values are all -1
    '''
    out = []
    for row in accList:
        valid = [x for x in row if x != -1]
        # np.average on an empty list emits a RuntimeWarning and yields nan;
        # make the all-filtered case explicit instead
        out.append(np.average(valid) if valid else np.nan)
    return out