# ConsultancyProject_2_ETL/Data_Analysis.py

import MySQLdb
import json
from datetime import datetime, timedelta
import numpy as np


def getPropertyDataFromDB():
    '''
    Function to get id, seed_id and check_data of every row of the properties table
    :return: propData, the rows returned by fetchall() for that query
    '''
    db = MySQLdb.connect(host="localhost",user="root",passwd="admin",db="consultancy")
    try:
        cur = db.cursor()

        cur.execute("SELECT id, seed_id, check_data "
                    "FROM properties ")
        propData = cur.fetchall()
    finally:
        # Close the connection even when the query raises, so it is not leaked
        db.close()

    return propData

def getDataFromDB(propId):
    '''
    Function to get data from MySQL database filter with the given propId
    :param propId: property_id value used to select the 'calendar' rows of the extractions table
    :return: scrapeDates and calendarData, each the fetchall() result of its own query
    '''
    db = MySQLdb.connect(host="localhost",user="root",passwd="admin",db="consultancy")
    try:
        cur = db.cursor()

        # propId is handed over as a bound parameter instead of being formatted
        # into the SQL text, so the driver takes care of quoting/escaping it
        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;", (propId,))
        scrapeDates = cur.fetchall()

        cur.execute("SELECT JSON_EXTRACT(body, '$.content.days') "
                    "FROM extractions "
                    "WHERE type='calendar' AND property_id = %s;", (propId,))
        calendarData = cur.fetchall()
    finally:
        # Close the connection even when one of the queries raises
        db.close()

    return scrapeDates, calendarData

def getUniqueScrapeDates():
    '''
    Function to get the '$.Date' header value of every 'calendar' extraction
    NOTE(review): the query has no DISTINCT, so despite the function name the
    same date can be returned once per extraction row - confirm callers expect that
    :return: uniqueScrapeDates, the rows returned by fetchall() for that query
    '''
    db = MySQLdb.connect(host="localhost",user="root",passwd="admin",db="consultancy")
    try:
        cur = db.cursor()

        cur.execute("SELECT JSON_EXTRACT(header, '$.Date') "
                    "FROM extractions "
                    "WHERE type='calendar'")
        uniqueScrapeDates = cur.fetchall()
    finally:
        # Close the connection even when the query raises, so it is not leaked
        db.close()

    return uniqueScrapeDates

def getPropsPerScrape(scrapeDate):
    '''
    Function to get the property_id of every 'calendar' extraction created on the given day
    :param scrapeDate: the day as a string in the format '%Y-%m-%d'; a ValueError is raised for any other format
    :return: the rows returned by fetchall(), one property_id per row
    '''
    date = datetime.strptime(scrapeDate, '%Y-%m-%d')
    end_date = date + timedelta(days=1)

    db = MySQLdb.connect(host="localhost",user="root",passwd="admin",db="consultancy")
    try:
        cur = db.cursor()

        # Both bounds are exclusive: a row stamped exactly at 00:00:00 of
        # scrapeDate is not matched. The values are bound parameters rather
        # than being formatted into the SQL text.
        cur.execute("SELECT property_id "
                    "FROM extractions "
                    "WHERE type='calendar' AND created_at > %s AND created_at < %s",
                    (scrapeDate, str(end_date)))
        propsPerScrape = cur.fetchall()
    finally:
        # Close the connection even when the query raises, so it is not leaked
        db.close()

    return propsPerScrape

def getuniquePropIdFromDB():
    '''
    Function to get unique propId from MySQL database
    :return: propList, a list with the distinct property_id values of the extractions table
    '''
    db = MySQLdb.connect(host="localhost",user="root",passwd="admin",db="consultancy")
    try:
        cur = db.cursor()
        cur.execute("SELECT DISTINCT property_id "
                    "FROM extractions;")
        propIds = cur.fetchall()
    finally:
        # Close the connection even when the query raises, so it is not leaked
        db.close()

    # Every fetched row is a one-column tuple; keep only the id itself
    return [propId[0] for propId in propIds]

def reformatScrapeDates(scrapeDatesIn):
    '''
    Converts the scrape date of every row into a short 'YYYY-MM-DD' string
    :param scrapeDatesIn: rows whose first element is a JSON array text; the first
                          array item is a timestamp such as 'Tue, 07 Apr 2020 12:34:56 GMT'
    :return: list with one '%Y-%m-%d' string per input row, in input order
    '''
    header_format = '%a, %d %b %Y %H:%M:%S %Z'

    def shorten(entry):
        # Only the calendar day of the timestamp is kept
        timestamp = json.loads(entry[0])[0]
        day = datetime.strptime(timestamp, header_format).date()
        return day.strftime('%Y-%m-%d')

    return [shorten(entry) for entry in scrapeDatesIn]

def checkForLostProprty(calendarData):
    '''
    Tells whether any row of the calendarData contains a "None" entry
    :param calendarData: iterable of rows as returned by the calendar query
    :return: True as soon as one row contains None, otherwise False
    '''
    return any(None in entry for entry in calendarData)


def getMinMaxDate(calendarData):
    '''
    Builds the gap-free list of days covered by the keys of the calendar data
    :param calendarData: rows from the calendar query; the first element of each row is
                         a JSON object text keyed by '%Y-%m-%d' dates, or None
    :return: list with every date from the earliest to the latest key (both included,
             one entry per day); an empty list if no row contains a date
    '''
    # Determine the minimal and maximal date over all rows
    seenDates = set()
    for row in calendarData:
        if row[0] is None:
            # Rows without calendar content contribute no dates
            continue
        days = json.loads(row[0])
        if not days:
            continue
        for key in days:
            seenDates.add(datetime.strptime(key, '%Y-%m-%d').date())

    if not seenDates:
        # min()/max() would raise on an empty collection
        return []

    start_dt = min(seenDates)
    end_dt = max(seenDates)
    span = (end_dt - start_dt).days

    return [start_dt + timedelta(days=offset) for offset in range(span + 1)]


def creatDataMatrix(HeaderDates, calendarData):
    '''
    Creates the data matrix from a calendar data
    :param HeaderDates: The list of all possible Dates in the dataset is used as the headers
    :param calendarData: the main information from the sql query; the first element of each
                         row is a JSON object text keyed by '%Y-%m-%d' dates, or None
    :return: data Matrix with one list per row of calendarData and one cell per header date;
             a cell holds the value stored for that date or -1 if the row has no entry for it.
             An IndexError is raised if a row contains a date that is not in HeaderDates.
    '''
    # Look-up table date -> column, built once instead of scanning HeaderDates
    # for every key. setdefault keeps the first position of a repeated date.
    columnOf = {}
    for position, headerDate in enumerate(HeaderDates):
        columnOf.setdefault(headerDate, position)

    data = []
    for row in calendarData:
        tempList = [-1] * len(HeaderDates)
        # A row without calendar content stays a row of -1 so that the matrix
        # keeps one line per input row
        tempJson = json.loads(row[0]) if row[0] is not None else None
        for key, content in (tempJson or {}).items():
            date = datetime.strptime(key, '%Y-%m-%d').date()
            try:
                column = columnOf[date]
            except KeyError:
                # IndexError is what the list based look-up raised before
                raise IndexError(f"date {key} is not part of HeaderDates") from None
            tempList[column] = content
        data.append(tempList)

    return data


def getAccuracy(df, baseLine, compLine):
    '''
    Calculates the accuracy of a given dataframe with a given baseLine and compLine
    :param df: dataframe whose first column is skipped and whose other columns are compared
    :param baseLine: row position (as used by iloc) of the reference row
    :param compLine: row position (as used by iloc) of the row compared with the reference
    :return: Accuracy: the fraction (0 to 1) of compared columns in which both rows hold the
             same value and neither value is -1; -1 if one of the row positions does not
             exist or if there is no column to compare
    '''
    try:
        pair = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1

    total = 0
    noChange = 0
    # The first column is left out of the comparison
    # (presumably it holds the scrape date label - confirm against the caller)
    for _, column in pair.iloc[:, 1:].items():
        total += 1
        # Read both values by position: the rows were selected by position too,
        # so this also works for a non-default index and for baseLine == compLine
        baseValue = column.iloc[0]
        compValue = column.iloc[1]
        if baseValue != -1 and compValue != -1 and baseValue == compValue:
            noChange += 1

    if total == 0:
        # Nothing to compare; use the same "no result" marker as for a missing row
        return -1

    return noChange / total

def getMeanAccuracy(accList):
    '''
    Averages the accuracy values of every row of a comparison
    :param accList: iterable of rows, each row being a list of accuracy values
    :return: list with one np.average result per row, computed over the values
             of that row that are not '-1'
    '''
    return [
        np.average([value for value in accuracies if value != -1])
        for accuracies in accList
    ]