200 lines
5.7 KiB
Python
200 lines
5.7 KiB
Python
import MySQLdb
|
|
import json
|
|
from datetime import datetime, timedelta
|
|
import numpy as np
|
|
|
|
|
|
def getPropertyDataFromDB():
    '''
    Function to read id, seed_id and check_data of every row in the properties table

    :return: result of cursor.fetchall(): one (id, seed_id, check_data) tuple per row
    '''
    # NOTE(review): connection settings are hard-coded here; consider moving them to configuration.
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT id, seed_id, check_data "
                    "FROM properties ")
        return cur.fetchall()
    finally:
        # Close the connection even if the query raises, so it is never leaked.
        db.close()
|
|
|
|
def getDataFromDB(propId):
    '''
    Function to get data from MySQL database filter with the given propId

    Both queries read rows of the extractions table with type='calendar' and the
    given property_id.

    :param propId: value matched against extractions.property_id
    :return: scrapeDates and calendarData; scrapeDates holds one 1-tuple per row with
        JSON_EXTRACT(header, '$.Date'), calendarData one 1-tuple per row with
        JSON_EXTRACT(body, '$.content.days')
    '''
    # NOTE(review): connection settings are hard-coded here; consider moving them to configuration.
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()

        # propId is passed as a query parameter so the driver escapes it;
        # it is never formatted into the SQL text.
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;",
                    (propId,))
        scrapeDates = cur.fetchall()

        cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;",
                    (propId,))
        calendarData = cur.fetchall()
    finally:
        # Close the connection even if one of the queries raises.
        db.close()

    return scrapeDates, calendarData
|
|
|
|
def getUniqueScrapeDates():
    '''
    Function to get the header date of every calendar extraction

    NOTE(review): despite the name, the query applies no DISTINCT, so the same date
    can appear more than once in the result; confirm whether callers de-duplicate.

    :return: result of cursor.fetchall(): one 1-tuple per row holding
        JSON_EXTRACT(header, '$.Date')
    '''
    # NOTE(review): connection settings are hard-coded here; consider moving them to configuration.
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar'")
        return cur.fetchall()
    finally:
        # Close the connection even if the query raises, so it is never leaked.
        db.close()
|
|
|
|
def getPropsPerScrape(scrapeDate):
    '''
    Function to get the property ids of all calendar extractions created on one day

    The day is treated as the half-open interval [scrapeDate 00:00:00, next day 00:00:00),
    so a row stamped exactly at midnight belongs to the day that starts at that instant.

    :param scrapeDate: day to look up as a 'YYYY-MM-DD' string
    :return: result of cursor.fetchall(): one (property_id,) tuple per matching row
    :raises ValueError: if scrapeDate does not match the '%Y-%m-%d' format
    '''
    day_start = datetime.strptime(scrapeDate, '%Y-%m-%d')
    day_end = day_start + timedelta(days=1)

    # NOTE(review): connection settings are hard-coded here; consider moving them to configuration.
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        # Bounds are passed as query parameters instead of being formatted into the SQL text.
        cur.execute("SELECT property_id "
                    "FROM extractions "
                    "WHERE type='calendar' AND created_at >= %s AND created_at < %s",
                    (day_start, day_end))
        return cur.fetchall()
    finally:
        # Close the connection even if the query raises, so it is never leaked.
        db.close()
|
|
|
|
def getuniquePropIdFromDB():
    '''
    Function to get unique propId from MySQL database

    :return: propList, a list with the distinct property_id values of the extractions table
    '''
    # NOTE(review): connection settings are hard-coded here; consider moving them to configuration.
    db = MySQLdb.connect(host="localhost", user="root", passwd="admin", db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT DISTINCT property_id "
                    "FROM extractions;")
        propIds = cur.fetchall()
    finally:
        # Close the connection even if the query raises, so it is never leaked.
        db.close()

    # Every fetched row is a 1-tuple; unwrap it to the bare id.
    return [row[0] for row in propIds]
|
|
|
|
def reformatScrapeDates(scrapeDatesIn):
    '''
    Reformats the scrapeDates column to a shortened date format

    :param scrapeDatesIn: iterable of rows; the first element of each row is a JSON text
        whose first item is a timestamp such as 'Tue, 05 Jan 2021 10:20:30 GMT'
    :return: list of 'YYYY-MM-DD' strings, one per input row and in input order
    :raises ValueError: if a timestamp does not match the expected format
    '''
    header_format = '%a, %d %b %Y %H:%M:%S %Z'
    return [
        datetime.strptime(json.loads(row[0])[0], header_format).date().strftime('%Y-%m-%d')
        for row in scrapeDatesIn
    ]
|
|
|
|
def checkForLostProprty(calendarData):
    '''
    Reports whether any row of the calendarData contains a "None" entry
    (a property that was no longer found)

    :param calendarData: iterable of rows, each row supporting the "in" operator
    :return: True if at least one row contains None, otherwise False
    '''
    return any(None in entry for entry in calendarData)
|
|
|
|
|
|
def getMinMaxDate(calendarData):
    '''
    Builds the continuous list of days spanned by a calendar data set

    The keys of every row's JSON object are parsed as 'YYYY-MM-DD' dates; the result
    runs day by day from the smallest to the largest of them (both included).

    :param calendarData: all calendar data from the query; the first element of each
        row is the JSON text of an object keyed by 'YYYY-MM-DD' strings
    :return: list of datetime.date objects from the minimal to the maximal date,
        or an empty list if the data contains no dates at all
    '''
    # Collect every date mentioned anywhere in the data set.
    fullDateList = []
    for row in calendarData:
        fullDateList.extend(
            datetime.strptime(key, '%Y-%m-%d').date() for key in json.loads(row[0])
        )

    # min()/max() raise on an empty sequence, so handle "no dates" explicitly.
    if not fullDateList:
        return []

    start_dt = min(fullDateList)
    end_dt = max(fullDateList)
    day_count = (end_dt - start_dt).days + 1
    return [start_dt + timedelta(days=offset) for offset in range(day_count)]
|
|
|
|
|
|
def creatDataMatrix(HeaderDates, calendarData):
    '''
    Creates the data matrix from a calendar data

    Each row of calendarData becomes one matrix row with one cell per header date.
    Cells whose date does not occur in the row's JSON object keep the placeholder -1.

    :param HeaderDates: The list of all possible Dates in the dataset is used as the headers
    :param calendarData: the main information from the sql query; the first element of
        each row is the JSON text of an object keyed by 'YYYY-MM-DD' strings
    :return: data Matrix (list of lists) with all the dates in the dataset
    :raises IndexError: if a row contains a date that is not part of HeaderDates
    '''
    # Resolve each header date to its column once, instead of scanning the header
    # list again for every single calendar entry. setdefault keeps the FIRST
    # position if a date appears more than once in HeaderDates.
    column_of = {}
    for position, header_date in enumerate(HeaderDates):
        column_of.setdefault(header_date, position)

    data = []
    for row in calendarData:
        matrix_row = [-1] * len(HeaderDates)
        for key, content in json.loads(row[0]).items():
            day = datetime.strptime(key, '%Y-%m-%d').date()
            try:
                matrix_row[column_of[day]] = content
            except KeyError:
                # Same exception type as the former list-index lookup, with a clearer message.
                raise IndexError(f"date {day} is not part of HeaderDates") from None
        data.append(matrix_row)

    return data
|
|
|
|
|
|
def getAccuracy(df, baseLine, compLine):
    '''
    Calculates the accuracy of a given dataframe with a given baseLine and compLine

    The first column is skipped (presumably an identifier/label column; confirm with
    the caller). Every remaining column counts towards the total. A column counts as
    "no change" only if neither value is the placeholder -1 and both values are equal.

    :param df: pandas DataFrame whose rows are compared
    :param baseLine: positional index of the reference row
    :param compLine: positional index of the row compared against the reference
    :return: Accuracy: the fraction (0..1) of compared columns that had the same
        information in both baseLine and compLine; -1 if a row position is out of
        range or if there is no column to compare
    '''
    try:
        pair = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1

    total = 0
    noChange = 0
    for _, column in pair.iloc[:, 1:].items():
        total += 1
        # The two rows were selected by position, so read them back by position too;
        # this stays correct for any index labels and for baseLine == compLine.
        base_value = column.iloc[0]
        comp_value = column.iloc[1]
        if base_value != -1 and comp_value != -1 and base_value == comp_value:
            noChange += 1

    # Without any comparable column there is no accuracy; use the same "not available"
    # marker as for an out-of-range row instead of dividing by zero.
    if total == 0:
        return -1

    return noChange / total
|
|
|
|
def getMeanAccuracy(accList):
    '''
    Averages the accuracy values of each comparison row, leaving out the '-1' markers

    :param accList: iterable of rows, each row an iterable of accuracy values
    :return: list with one np.average result per row, computed over the values != -1
    '''
    return [
        np.average([value for value in accuracies if value != -1])
        for accuracies in accList
    ]
|
|
|
|
|
|
|