data_quality.py erstellt zur Visualisierung der Datenqualität

main
mmaurostoffel 2024-12-11 01:01:52 +01:00
parent 338d3e9cc2
commit 125250a665
4 changed files with 58274 additions and 4 deletions

View File

@ -221,6 +221,7 @@ class Database:
property_id property_id
""") """)
def extractions_for(self, property_id): def extractions_for(self, property_id):
return self.connection.sql(f""" return self.connection.sql(f"""
SELECT SELECT

83
src/mauro/data_quality.py Normal file
View File

@ -0,0 +1,83 @@
import data
import json
import polars as pl
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
# Disabled one-off export step. It is wrapped in a bare triple-quoted string,
# so Python evaluates it as a string expression and none of it runs.
# When enabled it:
#   1. loads the extractions via data.load() / inst.extractions().pl(),
#   2. for every row takes property_id (row[1]) and the date part of
#      created_at (row[2]) and merges in the keys of the JSON document
#      stored in row[0],
#   3. writes the flattened rows to results/data_quality.csv, the same path
#      the analysis below reads back.
# NOTE(review): `data = []` rebinds the name of the imported `data` module and
# `dict` shadows the builtin; rename both before re-enabling this code.
# NOTE(review): the original indentation is not visible here, so it is unclear
# which statements belong to the `if jsonStr:` branch - confirm before reuse.
'''
# Get Data from DB
inst = data.load()
df = inst.extractions().pl()
print(df)
counter = 0
data = []
for row in df.iter_rows():
property_id = row[1]
created_at = row[2].date()
dict = {'property_id': property_id, 'created_at': created_at}
jsonStr = row[0]
if jsonStr:
calendarDict = json.loads(jsonStr)
for key in calendarDict:
dict[key] = calendarDict[key]
data.append(dict)
dfNew = pl.from_dicts(data)
dfNew.write_csv('results/data_quality.csv')
print(dfNew)
'''
# ---------------------------------------------------------------------------
# Data-quality heat maps.
#
# Reads the flattened extraction export, collapses every extraction to one
# number (the horizontal sum of all of its value columns) and draws two images:
#   1. dates x properties: the per-extraction sum,
#   2. the absolute change of that sum between consecutive dates.
# ---------------------------------------------------------------------------


def _build_date_property_matrix(frame):
    """Arrange the per-extraction sums in a (dates x properties) grid.

    Args:
        frame: polars DataFrame with the columns ``property_id``,
            ``created_at`` (Date) and ``sum``.

    Returns:
        A tuple ``(grid, labels)``. ``grid`` is a float ndarray with one row
        per distinct date (ascending) and one column per distinct property id
        (ascending); cells without an extraction are NaN. ``labels`` holds the
        row dates formatted as ``YYYY-MM-DD``.
    """
    property_ids = sorted(
        frame.get_column("property_id").drop_nulls().unique().to_list()
    )
    scrape_dates = sorted(
        frame.get_column("created_at").drop_nulls().unique().to_list()
    )
    row_of = {day: index for index, day in enumerate(scrape_dates)}
    col_of = {pid: index for index, pid in enumerate(property_ids)}

    # The axes are fixed up front, so every date gets a row and the rows are
    # chronological regardless of the order in which the CSV lists them.
    grid = np.full((len(scrape_dates), len(property_ids)), np.nan)
    cells = frame.select(
        pl.col("property_id"), pl.col("created_at"), pl.col("sum")
    )
    for pid, day, total in cells.iter_rows():
        if pid is None or day is None or total is None:
            continue
        # If a property has several rows for one date, the last one wins.
        grid[row_of[day], col_of[pid]] = total

    labels = [day.strftime('%Y-%m-%d') for day in scrape_dates]
    return grid, labels


dfNew = pl.read_csv('results/data_quality.csv')
# Native date parsing instead of a per-element Python callback.
dfNew = dfNew.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))

# Row sums over every column except the two identifying ones.
value_columns = [
    name for name in dfNew.columns if name not in ("property_id", "created_at")
]
df = dfNew.select(
    pl.col("property_id"),
    pl.col("created_at"),
    pl.sum_horizontal(value_columns).alias("sum"),
)

# Figure 1: sum per extraction, dates on the y axis, properties on the x axis.
matrix, dates = _build_date_property_matrix(df)
plt.imshow(matrix)
plt.yticks(list(range(0, len(dates), 5)), dates[::5])

# Figure 2: absolute change between consecutive dates. Row i compares
# dates[i] with dates[i + 1], so it has one row fewer than `matrix`; the tick
# positions are derived from its own height.
diffMatrix = np.abs(np.diff(matrix, axis=0))
plt.figure()
plt.imshow(diffMatrix)
plt.yticks(list(range(0, len(diffMatrix), 5)), dates[:-1][::5])
plt.show()

File diff suppressed because one or more lines are too long

View File

@ -106,7 +106,7 @@ heidi = merge.filter(pl.col("seed_id") == 1)
print("Heidiland meanPreorderTime = ", round(heidi.get_column("meanPreorderScrapeNum").mean()*3,2)) print("Heidiland meanPreorderTime = ", round(heidi.get_column("meanPreorderScrapeNum").mean()*3,2))
# 2 = Davos # 2 = Davos
Davos = merge.filter(pl.col("seed_id") == 2) Davos = merge.filter(pl.col("seed_id") == 2)
print("Davos meanPreorderTime = ",round(Davos.get_column("meanPreorderScrapeNum").mean()*3,2)) print("Davos meanPreorderTime = ", (Davos.get_column("meanPreorderScrapeNum").mean()*3,2))
# 3 = Engadin # 3 = Engadin
Engadin = merge.filter(pl.col("seed_id") == 3) Engadin = merge.filter(pl.col("seed_id") == 3)
print("Engadin meanPreorderTime = ", round(Engadin.get_column("meanPreorderScrapeNum").mean()*3,2)) print("Engadin meanPreorderTime = ", round(Engadin.get_column("meanPreorderScrapeNum").mean()*3,2))