"""Plot per-property data-quality sums over time as two heatmaps.

The script reads ``results/data_quality.csv`` (one row per property and
extraction date, followed by one numeric column per calendar key), sums the
numeric columns of every row, and shows:

1. a heatmap of those sums with dates on the y axis and properties on the
   x axis;
2. a heatmap of the absolute change of the sum between consecutive dates.
"""

import json
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import polars as pl

import data

# NOTE: ``data`` and ``json`` are only needed by the disabled extraction step
# below, which regenerates the CSV from the database.

# --- Disabled: rebuild results/data_quality.csv from the database -----------
# Uncomment to refresh the CSV. Local names avoid shadowing the ``data``
# module and the ``dict`` builtin.
#
# inst = data.load()
# extractions = inst.extractions().pl()
# print(extractions)
#
# records = []
# for row in extractions.iter_rows():
#     record = {'property_id': row[1], 'created_at': row[2].date()}
#     json_str = row[0]
#     if json_str:
#         record.update(json.loads(json_str))
#     records.append(record)
#
# quality = pl.from_dicts(records)
# quality.write_csv('results/data_quality.csv')
# print(quality)
# -----------------------------------------------------------------------------

DATE_FORMAT = "%Y-%m-%d"
KEY_COLUMNS = ("property_id", "created_at")
TICK_STEP = 5  # label every 5th date on the y axis

# Parse the date column with the native polars parser instead of a per-row
# Python lambda; the result is the same Date column.
quality = pl.read_csv("results/data_quality.csv").with_columns(
    pl.col("created_at").str.to_date(DATE_FORMAT)
)

# Row-wise sum over every column except the two key columns.
value_columns = [name for name in quality.columns if name not in KEY_COLUMNS]
daily_sums = quality.select(
    "property_id",
    "created_at",
    pl.sum_horizontal(value_columns).alias("sum"),
)

property_ids = sorted(daily_sums.get_column("property_id").unique().to_list())

# Collect {date label: sum} per property in a single pass over the rows.
# A later row for the same property and date overwrites an earlier one.
sums_by_property = {property_id: {} for property_id in property_ids}
for property_id, created_at, total in daily_sums.iter_rows():
    sums_by_property[property_id][created_at.strftime(DATE_FORMAT)] = total

# One row per property, one column per date. Scan all rows for the schema so
# that dates first seen in late rows are not dropped by schema inference.
grid = pl.DataFrame(
    [sums_by_property[property_id] for property_id in property_ids],
    infer_schema_length=None,
)

# Dict-built columns come in first-seen order; ISO date strings sort
# chronologically, so sorting the labels yields a proper time axis.
dates = sorted(grid.columns)

# Transpose so that rows are dates (y axis) and columns are properties.
matrix = grid.select(dates).to_numpy().T

plt.imshow(matrix)
plt.yticks(range(0, len(dates), TICK_STEP), dates[::TICK_STEP])

# Absolute change between consecutive dates: row i compares dates[i] with
# dates[i + 1], so the result has one row fewer than ``matrix``.
diff_matrix = np.abs(np.diff(matrix.astype(np.float64), axis=0))

plt.figure()
plt.imshow(diff_matrix)
# Only label rows that exist in the difference image.
diff_labels = dates[:-1]
plt.yticks(range(0, len(diff_labels), TICK_STEP), diff_labels[::TICK_STEP])

plt.show()