From eba2f0a265cec22a5fa7c0dedbc45f3a30499976 Mon Sep 17 00:00:00 2001 From: mmaurostoffel <166130318+mmaurostoffel@users.noreply.github.com> Date: Thu, 19 Dec 2024 18:11:15 +0100 Subject: [PATCH] Data Quality updated to include Regions and more information --- etl/src/data/__init__.py | 2 +- etl/src/data/database.py | 17 +++++++++++ etl/src/mauro/data_quality.py | 50 +++++++++++++++++++++++++++----- etl/src/mauro/vorbuchungsZeit.py | 4 +-- 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/etl/src/data/__init__.py b/etl/src/data/__init__.py index d1d5148..7c93c43 100644 --- a/etl/src/data/__init__.py +++ b/etl/src/data/__init__.py @@ -2,7 +2,7 @@ import os from dotenv import load_dotenv -from data import database +from etl.src.data import database dirname = os.path.dirname(__file__) envfile = os.path.join(dirname, '../.env') diff --git a/etl/src/data/database.py b/etl/src/data/database.py index b4d7c3b..ea340a0 100644 --- a/etl/src/data/database.py +++ b/etl/src/data/database.py @@ -221,6 +221,23 @@ class Database: property_id """) + def extractions_with_region(self): + return self.connection.sql(""" + SELECT + JSON_EXTRACT(body, '$.content.days') as calendar, + extractions.property_id, + extractions.created_at, + properties.seed_id, + regions.name + FROM + consultancy_d.extractions + LEFT JOIN + consultancy_d.properties ON properties.id = extractions.property_id + LEFT JOIN + consultancy_d.seeds ON seeds.id = properties.seed_id + LEFT JOIN + consultancy_d.regions ON regions.id = seeds.region_id + """) def extractions_for(self, property_id): return self.connection.sql(f""" diff --git a/etl/src/mauro/data_quality.py b/etl/src/mauro/data_quality.py index 9036a2c..0530133 100644 --- a/etl/src/mauro/data_quality.py +++ b/etl/src/mauro/data_quality.py @@ -1,4 +1,4 @@ -import data +from etl.src import data import json import polars as pl from datetime import datetime @@ -9,7 +9,7 @@ import numpy as np # Get Data from DB inst = data.load() -df = 
inst.extractions().pl() +df = inst.extractions_with_region().pl() print(df) counter = 0 @@ -17,7 +17,7 @@ data = [] for row in df.iter_rows(): property_id = row[1] created_at = row[2].date() - dict = {'property_id': property_id, 'created_at': created_at} + dict = {'property_id': property_id, 'created_at': created_at, 'name': row[3]} jsonStr = row[0] if jsonStr: @@ -30,8 +30,10 @@ for row in df.iter_rows(): dfNew = pl.from_dicts(data) dfNew.write_csv('results/data_quality.csv') print(dfNew) -''' + + +''' dfNew = pl.read_csv('results/data_quality.csv') dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date())) @@ -42,11 +44,30 @@ prop = dfTemp.get_column('property_id') dfTemp = dfTemp.drop('property_id') crea = dfTemp.get_column('created_at') dfTemp = dfTemp.drop('created_at') +name = dfTemp.get_column('name') +dfTemp = dfTemp.drop('name') dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns)) sumCol = dfTemp.get_column('sum') -# Create new DF with only property_id, created_at and sum -df = pl.DataFrame([prop, crea, sumCol]) +# Create new DF with only property_id, created_at, location name and sum +df = pl.DataFrame([prop, crea, name, sumCol]) +df = df.sort('created_at') + +# Create Full Copy +# 0 = All regions +# 1 = Heidiland +# 2 = Davos +# 3 = Engadin +# 4 = St. Moritz +filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. 
Moritz'] + +filter = 4 +if filter != 0: + df = df.filter(pl.col("name") == filter) + +# Remove Location name +df = df.drop('name') + # Get unique property_ids propsIDs = df.unique(subset=["property_id"]) @@ -65,13 +86,21 @@ for id in propsIDs: matrix = pl.DataFrame(matrix) dates = matrix.columns matrix = matrix.to_numpy() +# normalized +matrix = matrix/1111 + yRange = range(len(dates)) +xRange = range(len(propsIDs)) matrix = matrix.T plt.imshow(matrix) plt.yticks(yRange[::5], dates[::5]) +plt.xticks(xRange[::10], propsIDs[::10]) +plt.title(filterList[filter]) plt.xlabel("Property ID") plt.ylabel("Scrape Date") +plt.colorbar() +plt.tight_layout() # Create DiffMatrix diffMatrix = np.zeros((len(matrix)-1, len(matrix[0]))) @@ -82,6 +111,13 @@ for y in range(len(matrix[0])): plt.figure() plt.imshow(diffMatrix, cmap="Reds") plt.yticks(yRange[::5], dates[::5]) +plt.xticks(xRange[::10], propsIDs[::10]) +plt.title(filterList[filter]) plt.xlabel("Property ID") plt.ylabel("Scrape Date") -plt.show() \ No newline at end of file +plt.colorbar() +plt.tight_layout() + +plt.show() + + diff --git a/etl/src/mauro/vorbuchungsZeit.py b/etl/src/mauro/vorbuchungsZeit.py index 8baef3f..026f920 100644 --- a/etl/src/mauro/vorbuchungsZeit.py +++ b/etl/src/mauro/vorbuchungsZeit.py @@ -1,5 +1,5 @@ -import data -from data import etl_pipelines as ep +from etl.src import data +from etl.src.data import etl_pipelines as ep import polars as pl from datetime import datetime, timedelta import pandas as pd