"""Visualise scrape data quality per property as heatmaps.

Reads ``results/data_quality.csv`` (one row per property and scrape date,
followed by one column per calendar entry), sums the calendar columns of
each row, and draws two heatmaps for the selected region:

1. the normalised row sum per property (x axis) and scrape date (y axis);
2. the absolute change of that value between consecutive scrape dates.
"""
import json
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import polars as pl

from etl.src import data

CSV_PATH = 'results/data_quality.csv'

# Leading columns that identify a row; every other column is summed.
KEY_COLUMNS = ['property_id', 'created_at', 'name']

# Index 0 means "all regions" (no filtering); the other entries are compared
# against the `name` column.
# NOTE(review): the names must match the values stored in `name` exactly -
# verify against the database.
REGION_NAMES = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']
REGION_INDEX = 4

# Divisor used to scale the row sums before plotting.
# NOTE(review): presumably the largest possible row sum - TODO confirm.
NORMALIZATION_DIVISOR = 1111

# Only every n-th tick is labelled to keep the axes readable.
DATE_TICK_STEP = 5
PROPERTY_TICK_STEP = 10


def export_extractions_to_csv(csv_path=CSV_PATH):
    """Rebuild the CSV consumed by this script from the database.

    Not called by default; run it manually when the CSV has to be refreshed.
    Each extraction row is expected to hold the calendar JSON at index 0, the
    property id at index 1, the creation timestamp at index 2 and the region
    name at index 3. The JSON object's keys become additional columns.

    Returns the polars DataFrame that was written to ``csv_path``.
    """
    extractions = data.load().extractions_with_region().pl()

    records = []
    for row in extractions.iter_rows():
        record = {
            'property_id': row[1],
            'created_at': row[2].date(),
            'name': row[3],
        }
        calendar_json = row[0]
        if calendar_json:
            record.update(json.loads(calendar_json))
        records.append(record)

    frame = pl.from_dicts(records)
    frame.write_csv(csv_path)
    return frame


def plot_heatmap(values, title, dates, property_ids, cmap=None):
    """Draw ``values`` (rows = scrape dates, columns = properties) as an image.

    Tick positions are derived from the shape of ``values`` so that no tick
    is placed outside the image when ``values`` has fewer rows than ``dates``
    (as is the case for the difference matrix).
    """
    plt.figure()
    plt.imshow(values, cmap=cmap)

    y_ticks = range(values.shape[0])[::DATE_TICK_STEP]
    x_ticks = range(values.shape[1])[::PROPERTY_TICK_STEP]
    plt.yticks(y_ticks, [dates[i] for i in y_ticks])
    plt.xticks(x_ticks, [property_ids[i] for i in x_ticks])

    plt.title(title)
    plt.xlabel("Property ID")
    plt.ylabel("Scrape Date")
    plt.colorbar()
    plt.tight_layout()


# Load the prepared CSV and parse the scrape date.
df_raw = pl.read_csv(CSV_PATH)
df_raw = df_raw.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))

# Sum all calendar columns per row and keep only the identifying columns
# plus that sum, ordered by scrape date.
value_columns = [col for col in df_raw.columns if col not in KEY_COLUMNS]
df = df_raw.select(
    [*KEY_COLUMNS, pl.sum_horizontal(value_columns).alias('sum')]
).sort('created_at')

# Restrict to the selected region. The comparison must use the region NAME;
# comparing the string column against the numeric index never matches.
region_name = REGION_NAMES[REGION_INDEX]
if REGION_INDEX != 0:
    df = df.filter(pl.col("name") == region_name)
df = df.drop('name')

if df.height == 0:
    raise SystemExit(f"No rows found for region {region_name!r}")

property_ids = sorted(df.get_column("property_id").unique().to_list())

# Build one {date: sum} mapping per property in a single pass over the rows
# (columns are property_id, created_at, sum at this point).
sums_by_property = {property_id: {} for property_id in property_ids}
for property_id, created_at, row_sum in df.iter_rows():
    sums_by_property[property_id][created_at.strftime('%Y-%m-%d')] = row_sum

matrix_frame = pl.DataFrame(
    [sums_by_property[property_id] for property_id in property_ids]
)
# ISO dates sort chronologically as strings; enforce that order because the
# difference matrix below compares neighbouring scrape dates.
matrix_frame = matrix_frame.select(sorted(matrix_frame.columns))
dates = matrix_frame.columns

# Normalise and transpose so that rows are scrape dates and columns are
# properties.
matrix = (matrix_frame.to_numpy() / NORMALIZATION_DIVISOR).T

plot_heatmap(matrix, region_name, dates, property_ids)

# Absolute change between each scrape date and the following one.
diff_matrix = np.abs(np.diff(matrix, axis=0))

plot_heatmap(diff_matrix, region_name, dates, property_ids, cmap="Reds")

plt.show()