Data Quality updated to include Regions and more information

main
mmaurostoffel 2024-12-19 18:11:15 +01:00
parent a03ce3d647
commit eba2f0a265
4 changed files with 63 additions and 10 deletions

View File

@ -2,7 +2,7 @@ import os
from dotenv import load_dotenv
from data import database
from etl.src.data import database
dirname = os.path.dirname(__file__)
envfile = os.path.join(dirname, '../.env')

View File

@ -221,6 +221,23 @@ class Database:
property_id
""")
def extractions_with_region(self):
    """Return all extractions together with their seed id and region name.

    The query is passed to ``self.connection.sql`` and that call's result is
    returned unchanged. Selected columns, in order:

    0. ``calendar``    -- ``$.content.days`` extracted from the JSON ``body``
    1. ``property_id`` -- from ``extractions``
    2. ``created_at``  -- from ``extractions``
    3. ``seed_id``     -- from ``properties``
    4. ``name``        -- from ``regions``

    Every join is a LEFT JOIN, so an extraction is kept even when no
    matching property, seed or region row exists (those columns are NULL).
    """
    # The SQL text is kept verbatim so the statement sent to the database
    # is unchanged.
    query = """
SELECT
JSON_EXTRACT(body, '$.content.days') as calendar,
extractions.property_id,
extractions.created_at,
properties.seed_id,
regions.name
FROM
consultancy_d.extractions
LEFT JOIN
consultancy_d.properties ON properties.id = extractions.property_id
LEFT JOIN
consultancy_d.seeds ON seeds.id = properties.seed_id
LEFT JOIN
consultancy_d.regions ON regions.id = seeds.region_id
"""
    return self.connection.sql(query)
def extractions_for(self, property_id):
return self.connection.sql(f"""

View File

@ -1,4 +1,4 @@
import data
from etl.src import data
import json
import polars as pl
from datetime import datetime
@ -9,7 +9,7 @@ import numpy as np
# Get Data from DB
inst = data.load()
df = inst.extractions().pl()
df = inst.extractions_with_region().pl()
print(df)
counter = 0
@ -17,7 +17,7 @@ data = []
for row in df.iter_rows():
property_id = row[1]
created_at = row[2].date()
dict = {'property_id': property_id, 'created_at': created_at}
dict = {'property_id': property_id, 'created_at': created_at, 'name': row[3]}
jsonStr = row[0]
if jsonStr:
@ -30,8 +30,10 @@ for row in df.iter_rows():
dfNew = pl.from_dicts(data)
dfNew.write_csv('results/data_quality.csv')
print(dfNew)
'''
'''
dfNew = pl.read_csv('results/data_quality.csv')
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))
@ -42,11 +44,30 @@ prop = dfTemp.get_column('property_id')
dfTemp = dfTemp.drop('property_id')
crea = dfTemp.get_column('created_at')
dfTemp = dfTemp.drop('created_at')
name = dfTemp.get_column('name')
dfTemp = dfTemp.drop('name')
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
sumCol = dfTemp.get_column('sum')
# Create new DF with only property_id, created_at and sum
df = pl.DataFrame([prop, crea, sumCol])
# Create new DF with only property_id, created_at, location name and sum
df = pl.DataFrame([prop, crea, name, sumCol])
df = df.sort('created_at')
# Create Full Copy
# 0 = All regions (no filter applied)
# 1 = Heidiland
# 2 = Davos
# 3 = Engadin
# 4 = St. Moritz
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']
filter = 4
if filter != 0:
df = df.filter(pl.col("name") == filter)
# Remove Location name
df = df.drop('name')
# Get unique property_ids
propsIDs = df.unique(subset=["property_id"])
@ -65,13 +86,21 @@ for id in propsIDs:
matrix = pl.DataFrame(matrix)
dates = matrix.columns
matrix = matrix.to_numpy()
# normalized
matrix = matrix/1111
yRange = range(len(dates))
xRange = range(len(propsIDs))
matrix = matrix.T
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
# Create DiffMatrix
diffMatrix = np.zeros((len(matrix)-1, len(matrix[0])))
@ -82,6 +111,13 @@ for y in range(len(matrix[0])):
plt.figure()
plt.imshow(diffMatrix, cmap="Reds")
plt.yticks(yRange[::5], dates[::5])
plt.xticks(xRange[::10], propsIDs[::10])
plt.title(filterList[filter])
plt.xlabel("Property ID")
plt.ylabel("Scrape Date")
plt.colorbar()
plt.tight_layout()
plt.show()

View File

@ -1,5 +1,5 @@
import data
from data import etl_pipelines as ep
from etl.src import data
from etl.src.data import etl_pipelines as ep
import polars as pl
from datetime import datetime, timedelta
import pandas as pd