ConsultancyProject_2_ETL/etl/src/mauro/data_quality.py

122 lines
2.9 KiB
Python
Raw Normal View History

from etl.src import data
import json
import polars as pl
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
'''
# Get Data from DB
inst = data.load()
df = inst.extractions_with_region().pl()
print(df)
counter = 0
data = []
for row in df.iter_rows():
property_id = row[1]
created_at = row[2].date()
dict = {'property_id': property_id, 'created_at': created_at, 'name': row[3]}
jsonStr = row[0]
if jsonStr:
calendarDict = json.loads(jsonStr)
for key in calendarDict:
dict[key] = calendarDict[key]
data.append(dict)
dfNew = pl.from_dicts(data)
dfNew.write_csv('results/data_quality.csv')
print(dfNew)
'''
dfNew = pl.read_csv('results/data_quality.csv')
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))
# Create Row Means
dfTemp = dfNew
# Temporary Remove leading columns but save for later
prop = dfTemp.get_column('property_id')
dfTemp = dfTemp.drop('property_id')
crea = dfTemp.get_column('created_at')
dfTemp = dfTemp.drop('created_at')
name = dfTemp.get_column('name')
dfTemp = dfTemp.drop('name')
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
sumCol = dfTemp.get_column('sum')
# Create new DF with only property_id, created_at ,Location name and sum
df = pl.DataFrame([prop, crea, name, sumCol])
df = df.sort('created_at')
# Create Full Copy
# 0 = Alles
# 1 = Heidiland
# 2 = Davos
# 3 = Engadin
# 4 = St. Moritz
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']
filter = 4
if filter != 0:
df = df.filter(pl.col("name") == filter)
# Remove Location name
df = df.drop('name')
# Get unique property_ids
propsIDs = df.unique(subset=["property_id"])
propsIDs = propsIDs.get_column("property_id").to_list()
propsIDs.sort()
# create Matrix
matrix = []
for id in propsIDs:
dict = {}
temp = df.filter(pl.col("property_id") == id)
for row in temp.iter_rows():
dict[row[1].strftime('%Y-%m-%d')] = row[2]
matrix.append(dict)
matrix = pl.DataFrame(matrix)
dates = matrix.columns
matrix = matrix.to_numpy()
# normalized
matrix = matrix/1111
yRange = range(len(dates))
xRange = range(len(propsIDs))
matrix = matrix.T
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
# Create DiffMatrix
diffMatrix = np.zeros((len(matrix)-1, len(matrix[0])))
for y in range(len(matrix[0])):
for x in range(len(matrix)-1):
diffMatrix[x][y] = abs(matrix[x][y] - matrix[x+1][y])
plt.figure()
plt.imshow(diffMatrix, cmap="Reds")
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
plt.show()