'''
Helpers that read calendar extractions from the "consultancy" MySQL database
and turn them into per-date matrices and accuracy figures.
'''
import json
from datetime import datetime, timedelta

import numpy as np

try:
    import MySQLdb
except ImportError:
    # The pure data helpers below (date parsing, matrix building, accuracy)
    # do not need the driver; only the DB helpers do, and they raise a clear
    # ImportError through _fetch_all() when it is missing.
    MySQLdb = None


# NOTE(review): credentials are hard-coded in source. They are kept as they
# were to preserve behaviour, but should move to configuration/environment.
_DB_SETTINGS = {
    "host": "localhost",
    "user": "root",
    "passwd": "admin",
    "db": "consultancy",
}


def _fetch_all(query, params=None):
    '''
    Run a single query on a fresh connection and return all rows.
    The connection is closed even when the query raises.
    :param query: SQL text, using %s placeholders when params are given
    :param params: optional sequence of values bound to the placeholders
    :return: the rows as returned by cursor.fetchall()
    '''
    if MySQLdb is None:
        raise ImportError("MySQLdb is required for database access")
    db = MySQLdb.connect(**_DB_SETTINGS)
    try:
        cur = db.cursor()
        cur.execute(query, params)
        return cur.fetchall()
    finally:
        db.close()


def getPropertyDataFromDB():
    '''
    Function to get id, seed_id and check_data of every row in properties
    :return: propData, one (id, seed_id, check_data) row per property
    '''
    return _fetch_all("SELECT id, seed_id, check_data "
                      "FROM properties ")


def getDataFromDB(propId):
    '''
    Function to get data from MySQL database filter with the given propId
    :param propId: property_id of the calendar extractions to load
    :return: scrapeDates and calendarData, both as sequences of 1-tuples
    '''
    # Both columns are read in one query so that entry i of scrapeDates and
    # entry i of calendarData always come from the same extraction row; two
    # separate queries without ORDER BY give no such guarantee.
    # propId is bound as a parameter instead of being formatted into the SQL.
    rows = _fetch_all("SELECT JSON_EXTRACT(header, '$.Date'), "
                      "JSON_EXTRACT(body, '$.content.days') "
                      "FROM extractions "
                      "WHERE type='calendar' AND property_id = %s;",
                      (propId,))
    scrapeDates = tuple((row[0],) for row in rows)
    calendarData = tuple((row[1],) for row in rows)
    return scrapeDates, calendarData


def getUniqueScrapeDates():
    '''
    Function to get the header Date of every calendar extraction
    :return: uniqueScrapeDates, one 1-tuple per calendar extraction.
             The query itself does not de-duplicate the values.
    '''
    return _fetch_all("SELECT JSON_EXTRACT(header, '$.Date') "
                      "FROM extractions "
                      "WHERE type='calendar'")


def getPropsPerScrape(scrapeDate):
    '''
    Function to get the property ids of the calendar extractions created
    after scrapeDate (midnight) and before the following midnight
    :param scrapeDate: day as string in the format YYYY-MM-DD
    :return: one (property_id,) row per matching extraction
    '''
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)
    # Same bounds as before (both exclusive), but bound as parameters.
    return _fetch_all("SELECT property_id "
                      "FROM extractions "
                      "WHERE type='calendar' AND created_at > %s "
                      "AND created_at < %s",
                      (scrapeDate, str(end_date)))


def getuniquePropIdFromDB():
    '''
    Function to get unique propId from MySQL database
    :return: propList
    '''
    propIds = _fetch_all("SELECT DISTINCT property_id "
                         "FROM extractions;")
    return [propId[0] for propId in propIds]


def reformatScrapeDates(scrapeDatesIn):
    '''
    Reformats the scrapeDates column to a shortened datetime format
    :param scrapeDatesIn: rows whose first field is a JSON array; the first
                          element of the array is a date string such as
                          "Wed, 01 Jan 2020 10:00:00 GMT"
    :return: list of strings in the format YYYY-MM-DD
    '''
    scrapeDates = []
    for row in scrapeDatesIn:
        raw_date = json.loads(row[0])[0]
        parsed = datetime.strptime(raw_date, '%a, %d %b %Y %H:%M:%S %Z')
        scrapeDates.append(parsed.date().strftime('%Y-%m-%d'))
    return scrapeDates


def checkForLostProprty(calendarData):
    '''
    Checks if there are "None" Entries in the calendarData meaning they were
    no longer found
    :param calendarData: rows of calendar data
    :return: Boolean indicating if there are "None" Entries in the calendarData
    '''
    return any(None in row for row in calendarData)


def getMinMaxDate(calendarData):
    '''
    Gets the min and max values from a calendar data
    :param calendarData: get all calendar data from query; the first field of
                         every row is a JSON object keyed by YYYY-MM-DD dates
    :return: list of every date from the minimal to the maximal date
             (inclusive), or an empty list when there are no dates at all
    '''
    # determine the minimum and maximum date
    start_dt = None
    end_dt = None
    for row in calendarData:
        for key in json.loads(row[0]):
            day = datetime.strptime(key, '%Y-%m-%d').date()
            if start_dt is None or day < start_dt:
                start_dt = day
            if end_dt is None or day > end_dt:
                end_dt = day

    if start_dt is None:
        # No rows or only empty calendars: there is no range to build.
        return []

    span = (end_dt - start_dt).days
    return [start_dt + timedelta(days=offset) for offset in range(span + 1)]


def creatDataMatrix(HeaderDates, calendarData):
    '''
    Creates the data matrix from a calendar data
    :param HeaderDates: The list of all possible Dates in the dataset is used
                        as the headers
    :param calendarData: the main information from the sql query
    :return: data Matrix with all the dates in the dataset; dates a row has
             no entry for are filled with -1
    :raises IndexError: if a calendar contains a date missing in HeaderDates
    '''
    # Map every header date to its column once instead of scanning the list
    # for every key. setdefault keeps the first position on duplicates.
    column_of = {}
    for position, header in enumerate(HeaderDates):
        column_of.setdefault(header, position)

    data = []
    for row in calendarData:
        values = [-1] * len(HeaderDates)
        for key, content in json.loads(row[0]).items():
            day = datetime.strptime(key, '%Y-%m-%d').date()
            try:
                values[column_of[day]] = content
            except KeyError:
                # Keep the exception type callers saw before the dict lookup.
                raise IndexError(
                    f"date {day} is not part of HeaderDates") from None
        data.append(values)
    return data


def getAccuracy(df, baseLine, compLine):
    '''
    Calculates the accuracy of a given dataframe with a given baseLine and
    compLine
    :param df: dataframe whose first column is skipped and whose remaining
               columns hold one value per date
    :param baseLine: position of the row used as reference
    :param compLine: position of the row compared against the reference
    :return: Accuracy: The percentage of dates that had the same information
             in both baseLine and compLine. -1 if one of the rows does not
             exist or if there is no date column to compare.
    '''
    try:
        rows = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1

    # Read both rows by position, consistent with the positional selection
    # above. This also works for a non-default index and baseLine == compLine.
    base_values = rows.iloc[0, 1:].tolist()
    comp_values = rows.iloc[1, 1:].tolist()

    total = len(base_values)
    if total == 0:
        # Nothing to compare; avoid dividing by zero.
        return -1

    noChange = sum(
        1
        for base, comp in zip(base_values, comp_values)
        if base != -1 and comp != -1 and base == comp
    )
    return noChange / total


def getMeanAccuracy(accList):
    '''
    Get the mean Accuracy of the entire timedelay of one property
    :param accList: List of accuracy Values of a comparison
    :return: Average of the accuracy values while ignoring the '-1' values;
             NaN for a row that has no value other than '-1'
    '''
    out = []
    for row in accList:
        valid = [x for x in row if x != -1]
        # np.average of an empty list is NaN plus a RuntimeWarning; return the
        # NaN explicitly instead.
        out.append(np.average(valid) if valid else np.nan)
    return out