ConsultancyProject_2_ETL/etl/src/mauro/data_quality.py

122 lines
2.9 KiB
Python

from etl.src import data
import json
import polars as pl
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
'''
# Get Data from DB
inst = data.load()
df = inst.extractions_with_region().pl()
print(df)
counter = 0
data = []
for row in df.iter_rows():
property_id = row[1]
created_at = row[2].date()
dict = {'property_id': property_id, 'created_at': created_at, 'name': row[3]}
jsonStr = row[0]
if jsonStr:
calendarDict = json.loads(jsonStr)
for key in calendarDict:
dict[key] = calendarDict[key]
data.append(dict)
dfNew = pl.from_dicts(data)
dfNew.write_csv('results/data_quality.csv')
print(dfNew)
'''
dfNew = pl.read_csv('results/data_quality.csv')
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))
# Create Row Means
dfTemp = dfNew
# Temporary Remove leading columns but save for later
prop = dfTemp.get_column('property_id')
dfTemp = dfTemp.drop('property_id')
crea = dfTemp.get_column('created_at')
dfTemp = dfTemp.drop('created_at')
name = dfTemp.get_column('name')
dfTemp = dfTemp.drop('name')
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
sumCol = dfTemp.get_column('sum')
# Create new DF with only property_id, created_at ,Location name and sum
df = pl.DataFrame([prop, crea, name, sumCol])
df = df.sort('created_at')
# Create Full Copy
# 0 = Alles
# 1 = Heidiland
# 2 = Davos
# 3 = Engadin
# 4 = St. Moritz
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']
filter = 4
if filter != 0:
df = df.filter(pl.col("name") == filter)
# Remove Location name
df = df.drop('name')
# Get unique property_ids
propsIDs = df.unique(subset=["property_id"])
propsIDs = propsIDs.get_column("property_id").to_list()
propsIDs.sort()
# create Matrix
matrix = []
for id in propsIDs:
dict = {}
temp = df.filter(pl.col("property_id") == id)
for row in temp.iter_rows():
dict[row[1].strftime('%Y-%m-%d')] = row[2]
matrix.append(dict)
matrix = pl.DataFrame(matrix)
dates = matrix.columns
matrix = matrix.to_numpy()
# normalized
matrix = matrix/1111
yRange = range(len(dates))
xRange = range(len(propsIDs))
matrix = matrix.T
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
# Create DiffMatrix
diffMatrix = np.zeros((len(matrix)-1, len(matrix[0])))
for y in range(len(matrix[0])):
for x in range(len(matrix)-1):
diffMatrix[x][y] = abs(matrix[x][y] - matrix[x+1][y])
plt.figure()
plt.imshow(diffMatrix, cmap="Reds")
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
plt.show()