Compare commits
No commits in common. "4b7067fb635de32e78fc4f07713630818c873a3d" and "ce46655003bb3168e543bf860743ca1517f228c8" have entirely different histories.
4b7067fb63
...
ce46655003
|
@@ -2,7 +2,7 @@ import os
|
||||||
|
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
from etl.src.data import database
|
from data import database
|
||||||
|
|
||||||
dirname = os.path.dirname(__file__)
|
dirname = os.path.dirname(__file__)
|
||||||
envfile = os.path.join(dirname, '../.env')
|
envfile = os.path.join(dirname, '../.env')
|
||||||
|
|
|
@@ -303,23 +303,6 @@ class Database:
|
||||||
property_id
|
property_id
|
||||||
""")
|
""")
|
||||||
|
|
||||||
def extractions_with_region(self):
|
|
||||||
return self.connection.sql("""
|
|
||||||
SELECT
|
|
||||||
JSON_EXTRACT(body, '$.content.days') as calendar,
|
|
||||||
extractions.property_id,
|
|
||||||
extractions.created_at,
|
|
||||||
properties.seed_id,
|
|
||||||
regions.name
|
|
||||||
FROM
|
|
||||||
consultancy_d.extractions
|
|
||||||
LEFT JOIN
|
|
||||||
consultancy_d.properties ON properties.id = extractions.property_id
|
|
||||||
LEFT JOIN
|
|
||||||
consultancy_d.seeds ON seeds.id = properties.seed_id
|
|
||||||
LEFT JOIN
|
|
||||||
consultancy_d.regions ON regions.id = seeds.region_id
|
|
||||||
""")
|
|
||||||
|
|
||||||
def extractions_for(self, property_id):
|
def extractions_for(self, property_id):
|
||||||
return self.connection.sql(f"""
|
return self.connection.sql(f"""
|
||||||
|
|
|
@@ -1,4 +1,4 @@
|
||||||
from etl.src import data
|
import data
|
||||||
import json
|
import json
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
@@ -9,7 +9,7 @@ import numpy as np
|
||||||
# Get Data from DB
|
# Get Data from DB
|
||||||
inst = data.load()
|
inst = data.load()
|
||||||
|
|
||||||
df = inst.extractions_with_region().pl()
|
df = inst.extractions().pl()
|
||||||
print(df)
|
print(df)
|
||||||
|
|
||||||
counter = 0
|
counter = 0
|
||||||
|
@@ -17,7 +17,7 @@ data = []
|
||||||
for row in df.iter_rows():
|
for row in df.iter_rows():
|
||||||
property_id = row[1]
|
property_id = row[1]
|
||||||
created_at = row[2].date()
|
created_at = row[2].date()
|
||||||
dict = {'property_id': property_id, 'created_at': created_at, 'name': row[3]}
|
dict = {'property_id': property_id, 'created_at': created_at}
|
||||||
|
|
||||||
jsonStr = row[0]
|
jsonStr = row[0]
|
||||||
if jsonStr:
|
if jsonStr:
|
||||||
|
@@ -30,10 +30,8 @@ for row in df.iter_rows():
|
||||||
dfNew = pl.from_dicts(data)
|
dfNew = pl.from_dicts(data)
|
||||||
dfNew.write_csv('results/data_quality.csv')
|
dfNew.write_csv('results/data_quality.csv')
|
||||||
print(dfNew)
|
print(dfNew)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
dfNew = pl.read_csv('results/data_quality.csv')
|
dfNew = pl.read_csv('results/data_quality.csv')
|
||||||
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))
|
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))
|
||||||
|
|
||||||
|
@@ -44,30 +42,11 @@ prop = dfTemp.get_column('property_id')
|
||||||
dfTemp = dfTemp.drop('property_id')
|
dfTemp = dfTemp.drop('property_id')
|
||||||
crea = dfTemp.get_column('created_at')
|
crea = dfTemp.get_column('created_at')
|
||||||
dfTemp = dfTemp.drop('created_at')
|
dfTemp = dfTemp.drop('created_at')
|
||||||
name = dfTemp.get_column('name')
|
|
||||||
dfTemp = dfTemp.drop('name')
|
|
||||||
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
|
dfTemp = dfTemp.with_columns(sum=pl.sum_horizontal(dfTemp.columns))
|
||||||
sumCol = dfTemp.get_column('sum')
|
sumCol = dfTemp.get_column('sum')
|
||||||
|
|
||||||
# Create new DF with only property_id, created_at ,Location name and sum
|
# Create new DF with only property_id, created_at and sum
|
||||||
df = pl.DataFrame([prop, crea, name, sumCol])
|
df = pl.DataFrame([prop, crea, sumCol])
|
||||||
df = df.sort('created_at')
|
|
||||||
|
|
||||||
# Create Full Copy
|
|
||||||
# 0 = Alles
|
|
||||||
# 1 = Heidiland
|
|
||||||
# 2 = Davos
|
|
||||||
# 3 = Engadin
|
|
||||||
# 4 = St. Moritz
|
|
||||||
filterList = ['Alle Regionen', 'Heidiland', 'Davos', 'Engadin', 'St. Moritz']
|
|
||||||
|
|
||||||
filter = 4
|
|
||||||
if filter != 0:
|
|
||||||
df = df.filter(pl.col("name") == filter)
|
|
||||||
|
|
||||||
# Remove Location name
|
|
||||||
df = df.drop('name')
|
|
||||||
|
|
||||||
|
|
||||||
# Get unique property_ids
|
# Get unique property_ids
|
||||||
propsIDs = df.unique(subset=["property_id"])
|
propsIDs = df.unique(subset=["property_id"])
|
||||||
|
@@ -86,21 +65,13 @@ for id in propsIDs:
|
||||||
matrix = pl.DataFrame(matrix)
|
matrix = pl.DataFrame(matrix)
|
||||||
dates = matrix.columns
|
dates = matrix.columns
|
||||||
matrix = matrix.to_numpy()
|
matrix = matrix.to_numpy()
|
||||||
# normalized
|
|
||||||
matrix = matrix/1111
|
|
||||||
|
|
||||||
|
|
||||||
yRange = range(len(dates))
|
yRange = range(len(dates))
|
||||||
xRange = range(len(propsIDs))
|
|
||||||
matrix = matrix.T
|
matrix = matrix.T
|
||||||
plt.imshow(matrix)
|
plt.imshow(matrix)
|
||||||
plt.yticks(yRange[::5], dates[::5])
|
plt.yticks(yRange[::5], dates[::5])
|
||||||
plt.xticks(xRange[::10], propsIDs[::10])
|
|
||||||
plt.title(filterList[filter])
|
|
||||||
plt.xlabel("Property ID")
|
plt.xlabel("Property ID")
|
||||||
plt.ylabel("Scrape Date")
|
plt.ylabel("Scrape Date")
|
||||||
plt.colorbar()
|
|
||||||
plt.tight_layout()
|
|
||||||
|
|
||||||
# Create DiffMatrix
|
# Create DiffMatrix
|
||||||
diffMatrix = np.zeros((len(matrix)-1, len(matrix[0])))
|
diffMatrix = np.zeros((len(matrix)-1, len(matrix[0])))
|
||||||
|
@@ -111,13 +82,6 @@ for y in range(len(matrix[0])):
|
||||||
plt.figure()
|
plt.figure()
|
||||||
plt.imshow(diffMatrix, cmap="Reds")
|
plt.imshow(diffMatrix, cmap="Reds")
|
||||||
plt.yticks(yRange[::5], dates[::5])
|
plt.yticks(yRange[::5], dates[::5])
|
||||||
plt.xticks(xRange[::10], propsIDs[::10])
|
|
||||||
plt.title(filterList[filter])
|
|
||||||
plt.xlabel("Property ID")
|
plt.xlabel("Property ID")
|
||||||
plt.ylabel("Scrape Date")
|
plt.ylabel("Scrape Date")
|
||||||
plt.colorbar()
|
|
||||||
plt.tight_layout()
|
|
||||||
|
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@@ -1,5 +1,5 @@
|
||||||
from etl.src import data
|
import data
|
||||||
from etl.src.data import etl_pipelines as ep
|
from data import etl_pipelines as ep
|
||||||
import polars as pl
|
import polars as pl
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
Loading…
Reference in New Issue