Data Quality updated to include Regions and more information
This commit is contained in:
		
							parent
							
								
									a03ce3d647
								
							
						
					
					
						commit
						eba2f0a265
					
				@ -2,7 +2,7 @@ import os
 | 
			
		||||
 | 
			
		||||
from dotenv import load_dotenv
 | 
			
		||||
 | 
			
		||||
from data import database
 | 
			
		||||
from etl.src.data import database
 | 
			
		||||
 | 
			
		||||
dirname = os.path.dirname(__file__)
 | 
			
		||||
envfile = os.path.join(dirname, '../.env')
 | 
			
		||||
 | 
			
		||||
@ -221,6 +221,23 @@ class Database:
 | 
			
		||||
				property_id
 | 
			
		||||
		""")
 | 
			
		||||
 | 
			
		||||
	def extractions_with_region(self):
		"""Return every extraction joined with its property, seed and region.

		Result columns: calendar (the JSON '$.content.days' field of the
		extraction body), property_id, created_at, seed_id, and the region
		name.  LEFT JOINs keep extractions whose property, seed or region
		row is missing (those columns come back NULL).
		"""
		# NOTE(review): assumes self.connection exposes a .sql(query) method
		# (DuckDB-style relational API) — confirm against the class __init__.
		return self.connection.sql("""
			SELECT 
				JSON_EXTRACT(body, '$.content.days') as calendar, 
				extractions.property_id, 
				extractions.created_at,
				properties.seed_id, 
				regions.name
			FROM 
				consultancy_d.extractions 
			LEFT JOIN 
				consultancy_d.properties ON properties.id = extractions.property_id 
			LEFT JOIN 
				consultancy_d.seeds ON seeds.id = properties.seed_id 
			LEFT JOIN 
				consultancy_d.regions ON regions.id = seeds.region_id 
			""")
 | 
			
		||||
 | 
			
		||||
	def extractions_for(self, property_id):
 | 
			
		||||
		return self.connection.sql(f"""
 | 
			
		||||
 | 
			
		||||
@ -1,4 +1,4 @@
 | 
			
		||||
import data
 | 
			
		||||
from etl.src import data
 | 
			
		||||
import json
 | 
			
		||||
import polars as pl
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
@ -9,7 +9,7 @@ import numpy as np
 | 
			
		||||
# Get Data from DB
 | 
			
		||||
inst = data.load()
 | 
			
		||||
 | 
			
		||||
df = inst.extractions().pl()
 | 
			
		||||
df = inst.extractions_with_region().pl()
 | 
			
		||||
print(df)
 | 
			
		||||
 | 
			
		||||
counter = 0
 | 
			
		||||
@ -17,7 +17,7 @@ data = []
 | 
			
		||||
for row in df.iter_rows():
 | 
			
		||||
    property_id = row[1]
 | 
			
		||||
    created_at = row[2].date()
 | 
			
		||||
    dict = {'property_id': property_id, 'created_at': created_at}
 | 
			
		||||
    dict = {'property_id': property_id, 'created_at': created_at, 'name': row[3]}
 | 
			
		||||
 | 
			
		||||
    jsonStr = row[0]
 | 
			
		||||
    if jsonStr:
 | 
			
		||||
@ -30,8 +30,10 @@ for row in df.iter_rows():
 | 
			
		||||
dfNew = pl.from_dicts(data)
 | 
			
		||||
dfNew.write_csv('results/data_quality.csv')
 | 
			
		||||
print(dfNew)
 | 
			
		||||
'''
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
'''
 | 
			
		||||
dfNew = pl.read_csv('results/data_quality.csv')
 | 
			
		||||
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))
 | 
			
		||||
 | 
			
		||||
@ -42,11 +44,30 @@ prop = dfTemp.get_column('property_id')
 | 
			
		||||
dfTemp = dfTemp.drop('property_id')
 | 
			
		||||
crea = dfTemp.get_column('created_at')
 | 
			
		||||
dfTemp = dfTemp.drop('created_at')
 | 
			
		||||
name = dfTemp.get_column('name')
 | 
			
		||||
dfTemp = dfTemp.drop('name')
 | 
			
		||||
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
 | 
			
		||||
sumCol = dfTemp.get_column('sum')
 | 
			
		||||
 | 
			
		||||
# Create new DF with only property_id, created_at and sum
 | 
			
		||||
df = pl.DataFrame([prop, crea, sumCol])
 | 
			
		||||
# Create new DF with only property_id, created_at ,Location name and sum
 | 
			
		||||
df = pl.DataFrame([prop, crea, name, sumCol])
 | 
			
		||||
df = df.sort('created_at')
 | 
			
		||||
 | 
			
		||||
# Create Full Copy
 | 
			
		||||
# 0 = Alles
 | 
			
		||||
# 1 = Heidiland
 | 
			
		||||
# 2 = Davos
 | 
			
		||||
# 3 = Engadin
 | 
			
		||||
# 4 = St. Moritz
 | 
			
		||||
# Index-to-region lookup; index 0 means "all regions" (no filtering).
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']

filter = 4
if filter != 0:
    # BUG FIX: the 'name' column holds the region *name* (a string), so the
    # original comparison against the integer index `filter` could never
    # match and silently produced an empty DataFrame.  Compare against the
    # looked-up name instead (consistent with plt.title(filterList[filter])
    # used later in this script).
    df = df.filter(pl.col("name") == filterList[filter])

# Remove the location-name column once filtering is done; the remaining
# analysis only needs property_id, created_at and the sum column.
df = df.drop('name')
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Get unique property_ids
 | 
			
		||||
propsIDs = df.unique(subset=["property_id"])
 | 
			
		||||
@ -65,13 +86,21 @@ for id in propsIDs:
 | 
			
		||||
matrix = pl.DataFrame(matrix)
 | 
			
		||||
dates = matrix.columns
 | 
			
		||||
matrix = matrix.to_numpy()
 | 
			
		||||
# normalized
 | 
			
		||||
matrix = matrix/1111
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
yRange = range(len(dates))
 | 
			
		||||
xRange = range(len(propsIDs))
 | 
			
		||||
matrix = matrix.T
 | 
			
		||||
plt.imshow(matrix)
 | 
			
		||||
plt.yticks(yRange[::5], dates[::5])
 | 
			
		||||
plt.xticks(xRange[::10], propsIDs[::10])
 | 
			
		||||
plt.title(filterList[filter])
 | 
			
		||||
plt.xlabel("Property ID")
 | 
			
		||||
plt.ylabel("Scrape Date")
 | 
			
		||||
plt.colorbar()
 | 
			
		||||
plt.tight_layout()
 | 
			
		||||
 | 
			
		||||
# Create DiffMatrix
 | 
			
		||||
diffMatrix = np.zeros((len(matrix)-1, len(matrix[0])))
 | 
			
		||||
@ -82,6 +111,13 @@ for y in range(len(matrix[0])):
 | 
			
		||||
plt.figure()
 | 
			
		||||
plt.imshow(diffMatrix, cmap="Reds")
 | 
			
		||||
plt.yticks(yRange[::5], dates[::5])
 | 
			
		||||
plt.xticks(xRange[::10], propsIDs[::10])
 | 
			
		||||
plt.title(filterList[filter])
 | 
			
		||||
plt.xlabel("Property ID")
 | 
			
		||||
plt.ylabel("Scrape Date")
 | 
			
		||||
plt.show()
 | 
			
		||||
plt.colorbar()
 | 
			
		||||
plt.tight_layout()
 | 
			
		||||
 | 
			
		||||
plt.show()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@ -1,5 +1,5 @@
 | 
			
		||||
import data
 | 
			
		||||
from data import etl_pipelines as ep
 | 
			
		||||
from etl.src import data
 | 
			
		||||
from etl.src.data import etl_pipelines as ep
 | 
			
		||||
import polars as pl
 | 
			
		||||
from datetime import datetime, timedelta
 | 
			
		||||
import pandas as pd
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user