data_quality.py erstellt zur Visualisierung der Datenqualität
parent
338d3e9cc2
commit
125250a665
|
@ -221,6 +221,7 @@ class Database:
|
|||
property_id
|
||||
""")
|
||||
|
||||
|
||||
def extractions_for(self, property_id):
|
||||
return self.connection.sql(f"""
|
||||
SELECT
|
||||
|
|
|
@ -0,0 +1,83 @@
|
|||
import data
|
||||
import json
|
||||
import polars as pl
|
||||
from datetime import datetime
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
'''
|
||||
# Get Data from DB
|
||||
inst = data.load()
|
||||
|
||||
df = inst.extractions().pl()
|
||||
print(df)
|
||||
|
||||
counter = 0
|
||||
data = []
|
||||
for row in df.iter_rows():
|
||||
property_id = row[1]
|
||||
created_at = row[2].date()
|
||||
dict = {'property_id': property_id, 'created_at': created_at}
|
||||
|
||||
jsonStr = row[0]
|
||||
if jsonStr:
|
||||
calendarDict = json.loads(jsonStr)
|
||||
for key in calendarDict:
|
||||
dict[key] = calendarDict[key]
|
||||
|
||||
data.append(dict)
|
||||
|
||||
dfNew = pl.from_dicts(data)
|
||||
dfNew.write_csv('results/data_quality.csv')
|
||||
print(dfNew)
|
||||
'''
|
||||
|
||||
# Load the flattened extraction table from the CSV export.
dfNew = pl.read_csv('results/data_quality.csv')
# The CSV stores created_at as 'YYYY-MM-DD' text; parse it with the native
# polars expression instead of a per-element Python callback.
dfNew = dfNew.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))
|
||||
|
||||
# Row sums: add up every column of a row except the two leading
# identifier columns (property_id, created_at).
id_columns = ("property_id", "created_at")
value_columns = [name for name in dfNew.columns if name not in id_columns]

# Reduced frame holding only property_id, created_at and the row sum
df = dfNew.select(
    "property_id",
    "created_at",
    pl.sum_horizontal(value_columns).alias("sum"),
)
|
||||
|
||||
# Distinct property ids in ascending order
propsIDs = sorted(df.get_column("property_id").unique().to_list())
|
||||
|
||||
# Build a (property x date) table of row sums.
# First collect {date string: sum} per property in a single pass over df;
# if a (property, date) pair occurs more than once, the last row wins.
sums_by_property = {property_id: {} for property_id in propsIDs}
for row in df.iter_rows():
    sums_by_property[row[0]][row[1].strftime('%Y-%m-%d')] = row[2]

# ISO date strings sort chronologically, so sorting the keys yields the date
# columns in time order regardless of the order in which rows were seen.
all_dates = sorted(
    {day for per_day in sums_by_property.values() for day in per_day}
)

# One record per property (in propsIDs order) with an explicit entry for
# every date; dates without an extraction stay None.
records = [
    {day: sums_by_property[property_id].get(day) for day in all_dates}
    for property_id in propsIDs
]

# Scan all records for schema inference so a column whose first values are
# missing still gets its numeric dtype.
matrix = pl.DataFrame(records, infer_schema_length=None)
dates = matrix.columns
matrix = matrix.to_numpy()
|
||||
|
||||
# After the transpose each row is a date and each column a property
matrix = matrix.transpose()
yRange = range(len(dates))

# Heatmap of the row sums, labelling every fifth date on the y-axis
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])
|
||||
|
||||
# Absolute change between each row of matrix and the row that follows it,
# computed column-wise in one vectorized step.
diffMatrix = np.abs(np.diff(matrix, axis=0)).astype(float)

# Second figure: heatmap of the row-to-row changes
plt.figure()
plt.imshow(diffMatrix)
# diffMatrix has one row fewer than matrix, so only label rows that exist;
# row i is labelled with the i-th entry of dates.
diff_rows = range(len(diffMatrix))
plt.yticks(diff_rows[::5], dates[:len(diffMatrix)][::5])
plt.show()
|
File diff suppressed because one or more lines are too long
|
@ -103,13 +103,13 @@ print("Global meanPreorderTime = ",round(merge.get_column("meanPreorderScrapeNum
|
|||
|
||||
# 1 = Heidiland
|
||||
heidi = merge.filter(pl.col("seed_id") == 1)
|
||||
print("Heidiland meanPreorderTime = ",round(heidi.get_column("meanPreorderScrapeNum").mean()*3,2))
|
||||
print("Heidiland meanPreorderTime = ", round(heidi.get_column("meanPreorderScrapeNum").mean()*3,2))
|
||||
# 2 = Davos
|
||||
Davos = merge.filter(pl.col("seed_id") == 2)
|
||||
print("Davos meanPreorderTime = ",round(Davos.get_column("meanPreorderScrapeNum").mean()*3,2))
|
||||
print("Davos meanPreorderTime = ", round(Davos.get_column("meanPreorderScrapeNum").mean()*3,2))
|
||||
# 3 = Engadin
|
||||
Engadin = merge.filter(pl.col("seed_id") == 3)
|
||||
print("Engadin meanPreorderTime = ",round(Engadin.get_column("meanPreorderScrapeNum").mean()*3,2))
|
||||
print("Engadin meanPreorderTime = ", round(Engadin.get_column("meanPreorderScrapeNum").mean()*3,2))
|
||||
# 4 = St. Moritz
|
||||
Moritz = merge.filter(pl.col("seed_id") == 4)
|
||||
print("St. Moritz meanPreorderTime = ",round(Moritz.get_column("meanPreorderScrapeNum").mean()*3,2))
|
||||
print("St. Moritz meanPreorderTime = ", round(Moritz.get_column("meanPreorderScrapeNum").mean()*3,2))
|
Loading…
Reference in New Issue