Added 2 ETL pipelines, started investigating pre-booking lead time
parent d15664df43
commit 1e0b9f1233
@@ -196,16 +196,17 @@ class Database:
         """)
 
     def extractions(self):
-        return self.connection.sql("""
+        return self.connection.sql(f"""
         SELECT
-            JSON_EXTRACT(exception, '$.status') AS exception_status,
-            COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
+            JSON_EXTRACT(body, '$.content.days') as calendar,
+            property_id,
+            created_at
         FROM
             consultancy_d.extractions
         WHERE
-            type != 'property'
-        GROUP BY
-            extractions.date
+            type == 'calendar'
+        ORDER BY
+            property_id
         """)
 
     def extractions_for(self, property_id):
@@ -236,4 +237,33 @@ class Database:
         GROUP BY
             date
         ORDER BY date ASC
-        """"
+        """)
+
+    def price(self):
+        return self.connection.sql("""
+        SELECT
+            JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
+            JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+            JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
+            property_id,
+            created_at
+        FROM
+            consultancy_d.extractions
+        WHERE
+            type == 'price'
+        ORDER BY property_id
+        """)
+
+    def price_developement_per_property(self):
+        return self.connection.sql("""
+        SELECT
+            JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+            property_id,
+            created_at
+        FROM
+            consultancy_d.extractions
+        WHERE
+            type == 'price'
+        ORDER BY property_id
+        """)
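For context, the scripts added later in this commit consume these query methods via data.load() and DuckDB's .pl() conversion to Polars; a minimal usage sketch, with names taken from those scripts:

    import data

    inst = data.load()                                    # Database instance, as used by the scripts below
    calendars = inst.extractions().pl()                   # (calendar JSON, property_id, created_at) rows
    prices = inst.price_developement_per_property().pl()  # nightly price per property over time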
@@ -0,0 +1,47 @@
+import polars as pl
+import json
+from datetime import datetime, timedelta
+
+def expansion_Pipeline(df):
+    '''
+    Rearranges a given extractions DataFrame into an expanded DataFrame.
+    New columns: property_id, created_at, calendar_date, calendar_value
+    :param df: Input from the extractions or extractions_for functions in database.py
+    :return: expanded DataFrame
+    '''
+    data = []
+
+    for row in df.iter_rows():
+        propId = row[1]
+        createdAt = row[2]
+        if row[0]:
+            temp = json.loads(row[0])
+            keys = temp.keys()
+            for key in keys:
+                out = [propId, createdAt.date(), datetime.strptime(key, '%Y-%m-%d').date(), temp[key]]
+                data.append(out)
+
+    df = pl.DataFrame(data, schema=["property_id", "created_at", "calendar_date", "calendar_value"])
+    return df
+
+
+def liveDates_Pipeline(df):
+    '''
+    Returns the expanded DataFrame with only the live data and no future data.
+    :param df: Input from the extractions or extractions_for functions in database.py
+    :return: expanded and filtered DataFrame
+    '''
+    df = expansion_Pipeline(df)
+    print(df)
+    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
+    return df
+
+
+def liveDates_PipelineFromExpanded(df):
+    '''
+    Filters an already expanded df and returns only the live data and no future data.
+    NOTE: The values for the actual live date and the next day are always 0, most likely
+    because booking on the current or next day is forbidden. Workaround: compare with
+    the day after tomorrow.
+    :param df: Input from expansion_Pipeline
+    :return: expanded and filtered DataFrame
+    '''
+    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
+    return df
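A small worked example of what expansion_Pipeline does, assuming the calendar payload is a JSON object keyed by date (the exact key/value semantics are not shown in this commit):

    import polars as pl
    from datetime import datetime

    raw = pl.DataFrame({
        "calendar": ['{"2024-01-05": 1, "2024-01-06": 0}'],  # assumed payload shape
        "property_id": [10],
        "created_at": [datetime(2024, 1, 3)],
    })
    expanded = expansion_Pipeline(raw)
    # one row per calendar day:
    #   (10, 2024-01-03, 2024-01-05, 1)
    #   (10, 2024-01-03, 2024-01-06, 0)
    live = liveDates_PipelineFromExpanded(expanded)
    # keeps only calendar_date == created_at + 2 days (the day-after-tomorrow
    # workaround), here the 2024-01-05 row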
@@ -1,10 +1,36 @@
-import Data_Analysis as DA
 import pandas as pd
 import os
 import re
+import numpy as np
+
+def getAccuracy(df, baseLine, compLine):
+    try:
+        df = df.iloc[[baseLine, compLine]]
+    except IndexError:
+        return -1
+    total = 0
+    noChange = 0
+    first = True
+    for series_name, series in df.items():
+        if first:
+            first = False
+        else:
+            total += 1
+            #print(series_name)
+            if series[baseLine] != -1:
+                if series[compLine] != -1:
+                    if series[baseLine] == series[compLine]:
+                        noChange += 1
+
+    accuracy = noChange / total
+    return accuracy
+
+def getMeanAccuracy(accList):
+    out = []
+    for row in accList:
+        row = [x for x in row if x != -1]
+        out.append(np.average(row))
+    return out
+
 deltaList = [1, 2, 10, 20]
 #1 = 1 Scrape Interval
@@ -33,12 +59,12 @@ for file in os.listdir(directory):
         accList = []
         #Loop through all Dates as Baseline date
         for i in range(df.shape[0]):
-            acc = DA.getAccuracy(df, i, i+delta)
+            acc = getAccuracy(df, i, i+delta)
             accList.append(acc)
         fullList.append(accList)
 
 
-    meanList = DA.getMeanAccuracy(fullList)
+    meanList = getMeanAccuracy(fullList)
     accListDf = accListDf._append({'property_id': propId, 'timedelay_1': fullList[0], 'timedelay_2': fullList[1], 'timedelay_10': fullList[2], 'timedelay_20': fullList[3]}, ignore_index=True)
     accMeanDf = accMeanDf._append({'property_id': propId, 'timedelay_1': meanList[0], 'timedelay_2': meanList[1], 'timedelay_10': meanList[2], 'timedelay_20': meanList[3]}, ignore_index=True)
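A small worked example for getAccuracy, assuming the wide layout it expects (first column is an index/date column, the remaining columns hold per-date values, and -1 marks missing data):

    import pandas as pd

    df = pd.DataFrame({"date":       ["d0", "d1"],
                       "2024-01-05": [1, 1],      # unchanged -> counts as a match
                       "2024-01-06": [0, 1],      # changed
                       "2024-01-07": [-1, 1]})    # -1 is treated as missing, never a match
    getAccuracy(df, 0, 1)   # 1 match out of 3 compared columns -> 0.333...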
File diff suppressed because it is too large
@@ -0,0 +1,27 @@
+import polars as pl
+import pandas as pd
+import data
+
+inst = data.load()
+#test = inst.price().pl()
+test = inst.price_developement_per_property().pl()
+
+data = []
+
+propIds = test.get_column('property_id').to_list()
+propIds = list(dict.fromkeys(propIds))
+for id in propIds:
+    dates = []
+    prices = []
+    new_dict = {}
+    temp = test.filter(property_id=id)
+    for row in temp.iter_rows():
+        new_dict[row[2].date().strftime("%Y/%m/%d")] = row[0]
+
+    data.append([id, new_dict])
+
+df = pd.DataFrame(data)
+
+df.to_csv('results/priceAccuracyDict.csv')
+print(df)
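Each row written to priceAccuracyDict.csv is [property_id, {scrape date -> pricePerNight}], which the plotting script below re-parses; an assumed example row:

    # 0, 10, "{'2024/01/05': 120.0, '2024/01/06': 125.0}"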
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,48 @@
+import re
+
+import pandas as pd
+import matplotlib.pyplot as plt
+import ast
+from datetime import datetime
+import json
+
+'''
+### For priceAccuracySeparate.csv
+
+df = pd.read_csv('results/priceAccuracySeparate.csv')
+df.drop(columns=['Unnamed: 0'], inplace=True)
+df = df.rename({'0': 'property_id', '1': 'dates', '2': 'prices'}, axis=1)
+
+subset = df.loc[df['property_id'] == 10]
+x = ast.literal_eval(subset['dates'].tolist()[0])
+x_date = [datetime.strptime(item, '%Y/%m/%d') for item in x]
+y = ast.literal_eval(subset['prices'].tolist()[0])
+y_int = [float(item) for item in y]
+
+plt.plot(x_date, y_int)
+plt.show()
+'''
+
+df = pd.read_csv('results/priceAccuracyDict.csv')
+df.drop(columns=['Unnamed: 0'], inplace=True)
+df = df.rename({'0': 'property_id', '1': 'dicts'}, axis=1)
+
+subset = df.loc[df['property_id'] == 10]
+
+temp = subset['dicts'].iloc[0]
+temp = re.sub("'", '"', temp)
+curr_dict = json.loads(temp)
+
+keys = curr_dict.keys()
+keys = [datetime.strptime(item, "%Y/%m/%d") for item in keys]
+vals = curr_dict.values()
+vals = [float(item) for item in vals]
+
+plt.plot(curr_dict.keys(), vals)
+plt.xticks(rotation=45)
+plt.show()
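Note: the datetime-parsed keys are built but the final plt.plot call passes the raw string keys; a sketch of the datetime-axis variant under the same assumptions:

    plt.plot(keys, vals)      # parsed datetimes let matplotlib space the x-axis correctly
    plt.xticks(rotation=45)
    plt.show()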
@@ -0,0 +1,34 @@
+import data
+from data import etl_pipelines as ep
+import polars as pl
+
+
+'''
+#Create Data
+inst = data.load()
+
+df = inst.extractions().pl()
+df = ep.liveDates_Pipeline(df)
+
+df.write_csv('dok/liveDates.csv')
+print(df)
+'''
+
+#Load Data
+df = pl.read_csv('dok/liveDates.csv')
+
+propIds = df.get_column('property_id').unique()
+
+createdAt = df.get_column('created_at').unique()
+
+for propId in propIds:
+    for createdAt in createdAt:
+        temp = df.filter(pl.col("created_at") == createdAt)
+        temp = temp.filter(pl.col("property_id") == propId)
+        if temp.shape[0] > 0:
+            print(temp.get_column('calendar_value')[0])
+        else:
+            print(0)
+
+
+#Continue here
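Note: `for createdAt in createdAt:` rebinds the name to a scalar, so after the first property the date list is no longer available to iterate; a non-shadowing sketch of the same loop:

    createdDates = df.get_column('created_at').unique()
    for propId in propIds:
        for createdAt in createdDates:
            temp = df.filter((pl.col("created_at") == createdAt) & (pl.col("property_id") == propId))
            print(temp.get_column('calendar_value')[0] if temp.shape[0] > 0 else 0)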