Added 2 ETL pipelines, began investigating pre-booking time

refactor-to-mono
mmaurostoffel 2024-11-27 01:00:17 +01:00
parent d15664df43
commit 1e0b9f1233
10 changed files with 108912 additions and 13 deletions


@@ -196,16 +196,17 @@ class Database:
         """)
 
     def extractions(self):
-        return self.connection.sql("""
+        return self.connection.sql(f"""
             SELECT
-                JSON_EXTRACT(exception, '$.status') AS exception_status,
-                COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
+                JSON_EXTRACT(body, '$.content.days') as calendar,
+                property_id,
+                created_at
             FROM
                 consultancy_d.extractions
             WHERE
-                type != 'property'
-            GROUP BY
-                extractions.date
+                type == 'calendar'
+            ORDER BY
+                property_id
         """)
 
     def extractions_for(self, property_id):
@@ -236,4 +237,33 @@ class Database:
             GROUP BY
                 date
             ORDER BY date ASC
-        """"
+        """)
+
+    def price(self):
+        return self.connection.sql("""
+            SELECT
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+                JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
+            WHERE
+                type == 'price'
+            ORDER BY property_id
+        """)
+
+    def price_developement_per_property(self):
+        return self.connection.sql("""
+            SELECT
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
+            WHERE
+                type == 'price'
+            ORDER BY property_id
+        """)

src/data/etl_pipelines.py (new file, 47 lines)

@@ -0,0 +1,47 @@
import polars as pl
import json
from datetime import datetime, timedelta


def expansion_Pipeline(df):
    '''
    Rearranges a given extractions DataFrame into an expanded DataFrame.
    New columns: property_id, created_at, calendar_date, calendar_value
    :param df: input from the database.py extractions() or extractions_for() functions
    :return: expanded DataFrame
    '''
    data = []
    for row in df.iter_rows():
        propId = row[1]
        createdAt = row[2]
        if row[0]:
            temp = json.loads(row[0])
            for key in temp.keys():
                out = [propId, createdAt.date(), datetime.strptime(key, '%Y-%m-%d').date(), temp[key]]
                data.append(out)
    df = pl.DataFrame(data, schema=["property_id", "created_at", "calendar_date", "calendar_value"])
    return df


def liveDates_Pipeline(df):
    '''
    Returns the expanded DataFrame with only the live data and no future data.
    :param df: input from the database.py extractions() or extractions_for() functions
    :return: expanded and filtered DataFrame
    '''
    df = expansion_Pipeline(df)
    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
    return df


def liveDates_PipelineFromExpanded(df):
    '''
    Filters an already expanded DataFrame and returns only the live data and no future data.
    NOTE: The value for the actual live date and the next day is always 0, most likely because
    booking on the current or the next day is forbidden. Workaround: compare with the day after tomorrow.
    :param df: input from expansion_Pipeline
    :return: expanded and filtered DataFrame
    '''
    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
    return df
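To make the reshaping concrete, here is a toy run of expansion_Pipeline on an invented one-row calendar extraction (values are made up; the column order matches the extractions() query above):

from datetime import datetime
import polars as pl
from data import etl_pipelines as ep  # import path as used in the scripts below

toy = pl.DataFrame({
    "calendar": ['{"2024-11-29": 1, "2024-11-30": 0}'],  # JSON string, as selected by extractions()
    "property_id": [10],
    "created_at": [datetime(2024, 11, 27)],
})
print(ep.expansion_Pipeline(toy))  # 2 rows: one per calendar day in the JSON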


@@ -1,10 +1,36 @@
-import Data_Analysis as DA
 import pandas as pd
 import os
 import re
+import numpy as np
+
+
+def getAccuracy(df, baseLine, compLine):
+    try:
+        df = df.iloc[[baseLine, compLine]]
+    except IndexError:
+        return -1
+    total = 0
+    noChange = 0
+    first = True
+    for series_name, series in df.items():
+        if first:
+            first = False
+        else:
+            total += 1
+            #print(series_name)
+            if series[baseLine] != -1:
+                if series[compLine] != -1:
+                    if series[baseLine] == series[compLine]:
+                        noChange += 1
+    accuracy = noChange / total
+    return accuracy
+
+
+def getMeanAccuracy(accList):
+    out = []
+    for row in accList:
+        row = [x for x in row if x != -1]
+        out.append(np.average(row))
+    return out
+
 
 deltaList = [1, 2, 10, 20]
 #1 = 1 Scrape Interval
@@ -33,12 +59,12 @@ for file in os.listdir(directory):
         accList = []
         #Loop through all Dates as Baseline date
         for i in range(df.shape[0]):
-            acc = DA.getAccuracy(df, i, i+delta)
+            acc = getAccuracy(df, i, i+delta)
             accList.append(acc)
         fullList.append(accList)
 
-    meanList = DA.getMeanAccuracy(fullList)
+    meanList = getMeanAccuracy(fullList)
     accListDf = accListDf._append({'property_id': propId, 'timedelay_1': fullList[0], 'timedelay_2': fullList[1], 'timedelay_10': fullList[2], 'timedelay_20': fullList[3]}, ignore_index=True)
     accMeanDf = accMeanDf._append({'property_id': propId, 'timedelay_1': meanList[0], 'timedelay_2': meanList[1], 'timedelay_10': meanList[2], 'timedelay_20': meanList[3]}, ignore_index=True)
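A toy check of getAccuracy's contract, with invented values (run in the same module): columns after the first are calendar days, -1 marks missing data, and the result is the share of compared columns whose value is identical between the two rows:

import pandas as pd

toy = pd.DataFrame({
    "date":       ["2024-11-01", "2024-11-02"],  # first column is skipped
    "2024-12-01": [1, 1],    # unchanged -> counts toward noChange
    "2024-12-02": [1, 0],    # changed
    "2024-12-03": [-1, 1],   # missing in baseline row -> still in the denominator
})
print(getAccuracy(toy, 0, 1))  # 1/3, roughly 0.333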

src/mauro/dok/liveDates.csv (new file, 47978 lines; diff suppressed because it is too large)


@@ -0,0 +1,27 @@
import polars as pl
import pandas as pd

import data

inst = data.load()
#test = inst.price().pl()
test = inst.price_developement_per_property().pl()

data = []
propIds = test.get_column('property_id').to_list()
propIds = list(dict.fromkeys(propIds))  # deduplicate while preserving order

for id in propIds:
    new_dict = {}
    temp = test.filter(property_id=id)
    for row in temp.iter_rows():
        new_dict[row[2].date().strftime("%Y/%m/%d")] = row[0]  # created_at -> pricePerNight
    data.append([id, new_dict])

df = pd.DataFrame(data)
df.to_csv('results/priceAccuracyDict.csv')
print(df)
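One detail worth noting: to_csv() serializes the dict column via repr(), i.e. with single quotes, which is why testAccu.py below rewrites them before json.loads(). A minimal illustration with invented values:

import pandas as pd

pd.DataFrame([[10, {'2024/11/27': '250.0'}]]).to_csv('example.csv')
# example.csv cell: "{'2024/11/27': '250.0'}" -- single quotes, not valid JSON as-is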

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

src/mauro/testAccu.py (new file, 48 lines)

@@ -0,0 +1,48 @@
import re
import pandas as pd
import matplotlib.pyplot as plt
import ast
from datetime import datetime
import json

'''
### For priceAccuracySeparate.csv
df = pd.read_csv('results/priceAccuracySeparate.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df = df.rename({'0': 'property_id', '1': 'dates', '2': 'prices'}, axis=1)

subset = df.loc[df['property_id'] == 10]
x = ast.literal_eval(subset['dates'].tolist()[0])
x_date = [datetime.strptime(item, '%Y/%m/%d') for item in x]
y = ast.literal_eval(subset['prices'].tolist()[0])
y_int = [float(item) for item in y]

plt.plot(x_date, y_int)
plt.show()
'''

df = pd.read_csv('results/priceAccuracyDict.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df = df.rename({'0': 'property_id', '1': 'dicts'}, axis=1)

subset = df.loc[df['property_id'] == 10]
temp = subset['dicts'].iloc[0]
temp = re.sub("'", '"', temp)  # the dict was written via repr(); JSON needs double quotes
curr_dict = json.loads(temp)

keys = curr_dict.keys()
keys = [datetime.strptime(item, "%Y/%m/%d") for item in keys]
vals = curr_dict.values()
vals = [float(item) for item in vals]

plt.plot(keys, vals)  # fixed: plot against the parsed datetimes, not the raw string keys
plt.xticks(rotation=45)
plt.show()


@@ -0,0 +1,34 @@
import data
from data import etl_pipelines as ep
import polars as pl

'''
#Create Data
inst = data.load()
df = inst.extractions().pl()
df = ep.liveDates_Pipeline(df)
df.write_csv('dok/liveDates.csv')
print(df)
'''

#Load Data
df = pl.read_csv('dok/liveDates.csv')

propIds = df.get_column('property_id').unique()
createdAts = df.get_column('created_at').unique()

for propId in propIds:
    for createdAt in createdAts:  # fixed: loop variable no longer shadows the list it iterates
        temp = df.filter(pl.col("created_at") == createdAt)
        temp = temp.filter(pl.col("property_id") == propId)
        if temp.shape[0] > 0:
            print(temp.get_column('calendar_value')[0])
        else:
            print(0)

# Continue here
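The nested loops above rescan the full frame for every (property, date) pair. Under the same assumptions about liveDates.csv, the whole property-by-scrape-date matrix could be built in one pass with a polars pivot; a sketch, not part of this commit (assumes polars >= 1.0):

import polars as pl

df = pl.read_csv('dok/liveDates.csv')
matrix = df.pivot(
    on="created_at",             # one column per scrape date
    index="property_id",
    values="calendar_value",
    aggregate_function="first",  # expect one row per (property, date) pair
).fill_null(0)
print(matrix)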