122 lines
2.9 KiB
Python
122 lines
2.9 KiB
Python
from etl.src import data
|
|
import json
|
|
import polars as pl
|
|
from datetime import datetime
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
|
|
def export_data_quality_csv(csv_path='results/data_quality.csv'):
    """Rebuild the data-quality CSV from the database.

    This step was previously parked in a bare triple-quoted string, i.e. it
    was disabled. It is still NOT run automatically: call it by hand when the
    CSV read by the rest of this script has to be refreshed.

    Each extraction row becomes one record holding ``property_id``,
    ``created_at`` (date part only) and ``name``, extended by every key/value
    pair of the row's JSON payload.

    Args:
        csv_path: Destination of the CSV file.

    Returns:
        The polars DataFrame that was written to ``csv_path``.
    """
    # Get Data from DB
    inst = data.load()
    df = inst.extractions_with_region().pl()
    print(df)

    records = []
    for row in df.iter_rows():
        # NOTE(review): column positions (0 = JSON payload, 1 = property id,
        # 2 = timestamp, 3 = name) are taken from the original snippet -
        # confirm against extractions_with_region().
        record = {
            'property_id': row[1],
            'created_at': row[2].date(),
            'name': row[3],
        }

        # Rows without a payload keep only the three leading fields.
        jsonStr = row[0]
        if jsonStr:
            record.update(json.loads(jsonStr))

        records.append(record)

    dfNew = pl.from_dicts(records)
    dfNew.write_csv(csv_path)
    print(dfNew)
    return dfNew

# Load the previously exported extraction data.
dfNew = pl.read_csv('results/data_quality.csv')

# Parse the ISO date strings with the native polars expression; a Python
# lambda via map_elements would be called once per element.
dfNew = dfNew.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))

# Create row sums: every column that is not one of the three leading
# identifier columns contributes to the per-row total.
idCols = ['property_id', 'created_at', 'name']
valueCols = [col for col in dfNew.columns if col not in idCols]

# Create new DF with only property_id, created_at, location name and sum,
# ordered by scrape date.
df = dfNew.select(
    [pl.col(col) for col in idCols]
    + [pl.sum_horizontal(valueCols).alias('sum')]
)
df = df.sort('created_at')

# Region selection. `filter` is an index into filterList:
#   0 = all regions (nothing is filtered out)
#   1 = Heidiland
#   2 = Davos
#   3 = Engadin
#   4 = St. Moritz
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']

# NOTE: this name shadows the builtin `filter`; it is kept because the plot
# titles further down read filterList[filter].
filter = 4

# NOTE(review): the numeric index itself (not filterList[filter]) is compared
# with the "name" column - confirm that this column holds region numbers
# rather than region names.
if filter != 0:
    regionMatches = pl.col("name") == filter
    df = df.filter(regionMatches)

# The location column was only needed for the region selection above.
df = df.drop('name')

# Get unique property_ids, sorted; one matrix row per property.
propsIDs = sorted(df.get_column("property_id").unique().to_list())

# Distinct scrape dates in chronological order; one matrix column per date.
# Sorting explicitly matters: deriving the columns from a list of per-property
# dicts takes the order from whichever property is seen first, so dates can
# end up out of order (or be left out), which breaks any comparison of
# neighbouring dates.
scrapeDates = sorted(df.get_column("created_at").unique().to_list())
dates = [day.strftime('%Y-%m-%d') for day in scrapeDates]

# Divisor applied to every row sum.
# NOTE(review): presumably the largest possible row sum - confirm.
NORMALISATION_DIVISOR = 1111

# create Matrix: (property x date) grid, NaN where a property has no
# extraction on a given date. Filled in one pass over the rows instead of
# filtering the whole frame once per property.
rowOf = {propId: i for i, propId in enumerate(propsIDs)}
colOf = {day: j for j, day in enumerate(scrapeDates)}
matrix = np.full((len(propsIDs), len(dates)), np.nan)
for propId, day, total in df.select(['property_id', 'created_at', 'sum']).iter_rows():
    # For repeated (property, date) pairs the row seen last wins.
    matrix[rowOf[propId], colOf[day]] = np.nan if total is None else total

# normalized
matrix = matrix / NORMALISATION_DIVISOR

# Index ranges for the axis ticks: one position per scrape date (y axis) and
# one per property (x axis). The second figure below reuses both.
yRange = range(len(dates))
xRange = range(len(propsIDs))

# Transpose so that each row is a scrape date and each column a property.
matrix = matrix.T

# Heatmap of the normalized values.
plt.imshow(matrix)
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")

# Label only every 5th date and every 10th property.
dateTickPositions = yRange[::5]
dateTickLabels = dates[::5]
plt.yticks(dateTickPositions, dateTickLabels)

propTickPositions = xRange[::10]
propTickLabels = propsIDs[::10]
plt.xticks(propTickPositions, propTickLabels)

plt.colorbar()
plt.tight_layout()

# Create DiffMatrix: absolute change of every property between two
# consecutive rows of `matrix` (row x compares matrix[x] with matrix[x+1]).
# np.diff does this in one vectorised step and also copes with a matrix that
# has no rows, where indexing matrix[0] would raise.
diffMatrix = np.abs(np.diff(matrix, axis=0))

plt.figure()
plt.imshow(diffMatrix, cmap="Reds")

# diffMatrix has one row fewer than there are dates, so the last date is
# dropped from the tick candidates; otherwise a tick could point at a row
# that is not part of the image. Row x is labelled with the earlier of the
# two dates it compares.
plt.yticks(yRange[:-1][::5], dates[:-1][::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()

# Display both figures (blocks until the windows are closed when an
# interactive backend is in use).
plt.show()