Added 2 ETL pipelines, started investigating pre-booking time
parent d15664df43
commit 1e0b9f1233
@@ -196,16 +196,17 @@ class Database:
         """)

     def extractions(self):
-        return self.connection.sql("""
-            SELECT
-                JSON_EXTRACT(exception, '$.status') AS exception_status,
-                COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
-            FROM
-                consultancy_d.extractions
+        return self.connection.sql(f"""
+            SELECT
+                JSON_EXTRACT(body, '$.content.days') as calendar,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
             WHERE
-                type != 'property'
-            GROUP BY
-                extractions.date
+                type == 'calendar'
+            ORDER BY
+                property_id
         """)

     def extractions_for(self, property_id):
@@ -236,4 +237,33 @@ class Database:
             GROUP BY
                 date
             ORDER BY date ASC
-        """"
+        """)
+
+    def price(self):
+        return self.connection.sql("""
+            SELECT
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+                JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
+            WHERE
+                type == 'price'
+            ORDER BY property_id
+        """)
+
+    def price_developement_per_property(self):
+        return self.connection.sql("""
+            SELECT
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
+            WHERE
+                type == 'price'
+            ORDER BY property_id
+        """)
+
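Not part of the diff, just for orientation: a minimal sketch of how these new `Database` query methods are consumed by the analysis scripts further down. It assumes, as those scripts do, that `data.load()` returns the wrapper and that `.pl()` converts the query result (which the `.sql(...).pl()` pattern suggests is a DuckDB relation) into a polars DataFrame.

```python
import data

# data.load() is how the scripts below obtain the Database wrapper.
inst = data.load()

# Calendar JSON per extraction: columns calendar, property_id, created_at.
calendar = inst.extractions().pl()

# Lowest price per extraction: pricePerWeek, pricePerNight, currency, property_id, created_at.
prices = inst.price().pl()

# Per-night price history used for the price-accuracy analysis.
price_history = inst.price_developement_per_property().pl()
```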
@@ -0,0 +1,47 @@
import polars as pl
import json
from datetime import datetime, timedelta

def expansion_Pipeline(df):
    '''
    Rearranges a given extractions DataFrame into an expanded DataFrame.
    New columns: property_id, created_at, calendar_date, calendar_value
    :param df: input from the database.py extractions() or extractions_for() functions
    :return: expanded dataframe
    '''
    data = []

    for row in df.iter_rows():
        propId = row[1]
        createdAt = row[2]
        if row[0]:
            temp = json.loads(row[0])
            keys = temp.keys()
            for key in keys:
                out = [propId, createdAt.date(), datetime.strptime(key, '%Y-%m-%d').date(), temp[key]]
                data.append(out)

    df = pl.DataFrame(data, schema=["property_id", "created_at", "calendar_date", "calendar_value"])
    return df


def liveDates_Pipeline(df):
    '''
    Returns the expanded DataFrame with only the live data and no future data.
    :param df: input from the database.py extractions() or extractions_for() functions
    :return: expanded and filtered dataframe
    '''
    df = expansion_Pipeline(df)
    print(df)
    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
    return df


def liveDates_PipelineFromExpanded(df):
    '''
    Filters an already expanded df and returns only the live data and no future data.
    NOTE: The actual live date and the next day are always 0. The most likely reason is that
    booking on the current or next day is forbidden. Workaround: compare with the day after tomorrow.
    :param df: input from expansion_Pipeline
    :return: expanded and filtered dataframe
    '''
    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
    return df
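Again not part of the commit, only a usage sketch for the two new pipelines, following the docstrings above and the imports used in the pre-booking script at the end of this diff:

```python
import data
from data import etl_pipelines as ep

inst = data.load()
raw = inst.extractions().pl()  # calendar JSON, property_id, created_at

# One row per (property_id, created_at, calendar_date) with the availability value.
expanded = ep.expansion_Pipeline(raw)

# Keep only the "live" view: calendar_date == created_at + 2 days
# (the day after tomorrow, per the NOTE in liveDates_PipelineFromExpanded).
live = ep.liveDates_PipelineFromExpanded(expanded)
print(live)
```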
@@ -1,10 +1,36 @@
import Data_Analysis as DA
import pandas as pd
import os
import re
import numpy as np

def getAccuracy(df, baseLine, compLine):
    try:
        df = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    first = True
    for series_name, series in df.items():
        if first:
            first = False
        else:
            total += 1
            #print(series_name)
            if series[baseLine] != -1:
                if series[compLine] != -1:
                    if series[baseLine] == series[compLine]:
                        noChange += 1

    accuracy = noChange / total
    return accuracy


def getMeanAccuracy(accList):
    out = []
    for row in accList:
        row = [x for x in row if x != -1]
        out.append(np.average(row))
    return out


deltaList = [1, 2, 10, 20]
#1 = 1 Scrape Interval
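To make the accuracy metric concrete, a small hypothetical example (the column layout is invented for illustration; the real frames come from the scraped calendar CSVs, and `getAccuracy` is the function defined in the hunk above). It skips the first column, counts every remaining column, and only treats a column as unchanged if both rows hold a value other than -1:

```python
import pandas as pd

# Toy frame: one row per scrape, first column a label, remaining columns calendar values.
toy = pd.DataFrame({
    "date":  ["2024-01-01", "2024-01-02", "2024-01-03"],
    "day_1": [1, 1, 0],
    "day_2": [0, 0, 0],
    "day_3": [-1, 1, 1],
})

# Compare scrape 0 with scrape 1: day_1 and day_2 match, day_3 is ignored because the
# baseline value is -1, so accuracy = 2 / 3.
print(getAccuracy(toy, 0, 1))

# An out-of-range comparison row returns -1, which getMeanAccuracy later filters out.
print(getAccuracy(toy, 0, 10))
```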
@@ -33,12 +59,12 @@ for file in os.listdir(directory):
         accList = []
         #Loop through all Dates as Baseline date
         for i in range(df.shape[0]):
-            acc = DA.getAccuracy(df, i, i+delta)
+            acc = getAccuracy(df, i, i+delta)
             accList.append(acc)
         fullList.append(accList)


-    meanList = DA.getMeanAccuracy(fullList)
+    meanList = getMeanAccuracy(fullList)
     accListDf = accListDf._append({'property_id': propId, 'timedelay_1': fullList[0], 'timedelay_2': fullList[1], 'timedelay_10': fullList[2], 'timedelay_20': fullList[3]}, ignore_index=True)
     accMeanDf = accMeanDf._append({'property_id': propId, 'timedelay_1': meanList[0], 'timedelay_2': meanList[1], 'timedelay_10': meanList[2], 'timedelay_20': meanList[3]}, ignore_index=True)

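One design note on the two lines above: `DataFrame._append` is the private remnant of `DataFrame.append`, which newer pandas releases removed. If portability matters, the same row-wise accumulation can be written with `pd.concat`, e.g. for the mean frame (a sketch reusing the names from the hunk above):

```python
import pandas as pd

# Build a one-row frame and concatenate it onto the accumulator.
row = pd.DataFrame([{'property_id': propId,
                     'timedelay_1': meanList[0],
                     'timedelay_2': meanList[1],
                     'timedelay_10': meanList[2],
                     'timedelay_20': meanList[3]}])
accMeanDf = pd.concat([accMeanDf, row], ignore_index=True)
```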
File diff suppressed because it is too large
@@ -0,0 +1,27 @@
import polars as pl
import pandas as pd
import data


inst = data.load()
#test = inst.price().pl()
test = inst.price_developement_per_property().pl()

data = []

propIds = test.get_column('property_id').to_list()
propIds = list(dict.fromkeys(propIds))
for id in propIds:
    dates = []
    prices = []
    new_dict = {}
    temp = test.filter(property_id=id)
    for row in temp.iter_rows():
        new_dict[row[2].date().strftime("%Y/%m/%d")] = row[0]

    data.append([id, new_dict])

df = pd.DataFrame(data)

df.to_csv('results/priceAccuracyDict.csv')
print(df)
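A small observation, not part of the commit: `pd.DataFrame(data)` stores the dict column as its Python `repr`, which is why the plotting script below has to swap single for double quotes before `json.loads`. Serialising the dict with `json.dumps` while building the rows would make the CSV directly parseable again; a sketch with hypothetical column names:

```python
import json
import pandas as pd

# Same per-property structure as above, but the dict is written as proper JSON text.
df = pd.DataFrame(
    [[prop_id, json.dumps(price_dict)] for prop_id, price_dict in data],
    columns=['property_id', 'prices'],
)
df.to_csv('results/priceAccuracyDict.csv', index=False)
```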
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,48 @@
import re

import pandas as pd
import matplotlib.pyplot as plt
import ast
from datetime import datetime
import json
'''
### For priceAccuracySeparate.csv


df = pd.read_csv('results/priceAccuracySeparate.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df = df.rename({'0': 'property_id', '1': 'dates', '2': 'prices'}, axis=1)


subset = df.loc[df['property_id'] == 10]
x = ast.literal_eval(subset['dates'].tolist()[0])
x_date = [datetime.strptime(item, '%Y/%m/%d') for item in x]
y = ast.literal_eval(subset['prices'].tolist()[0])
y_int = [float(item) for item in y]


plt.plot(x_date, y_int)
plt.show()
'''
df = pd.read_csv('results/priceAccuracyDict.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df = df.rename({'0': 'property_id', '1': 'dicts'}, axis=1)

subset = df.loc[df['property_id'] == 10]

temp = subset['dicts'].iloc[0]
temp = re.sub("'", '"', temp)
curr_dict = json.loads(temp)


keys = curr_dict.keys()
keys = [datetime.strptime(item, "%Y/%m/%d") for item in keys]
vals = curr_dict.values()
vals = [float(item) for item in vals]

# plot the parsed dates against the per-night prices
plt.plot(keys, vals)
plt.xticks(rotation=45)
plt.show()
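As the commented-out `priceAccuracySeparate.csv` branch above already does, `ast.literal_eval` can parse the Python-repr dict directly and makes the quote substitution unnecessary; a minimal alternative for the three lines that build `curr_dict`:

```python
import ast

# Parse the stringified dict without rewriting quotes first.
curr_dict = ast.literal_eval(subset['dicts'].iloc[0])
```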
@@ -0,0 +1,34 @@
import data
from data import etl_pipelines as ep
import polars as pl


'''
#Create Data
inst = data.load()

df = inst.extractions().pl()
df = ep.liveDates_Pipeline(df)

df.write_csv('dok/liveDates.csv')
print(df)
'''

#Load Data
df = pl.read_csv('dok/liveDates.csv')

propIds = df.get_column('property_id').unique()

createdDates = df.get_column('created_at').unique()

for propId in propIds:
    for createdAt in createdDates:
        temp = df.filter(pl.col("created_at") == createdAt)
        temp = temp.filter(pl.col("property_id") == propId)
        if temp.shape[0] > 0:
            print(temp.get_column('calendar_value')[0])
        else:
            print(0)


# Continue here
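Purely a sketch of a possible continuation (not in the commit): the nested loop above prints one calendar value per property and scrape date, with 0 for missing combinations; a polars pivot produces the same table in one step. This assumes `calendar_value` was read back as a numeric column; the `on=` keyword is the current polars name for the pivoted column (older releases call it `columns=`).

```python
import polars as pl

df = pl.read_csv('dok/liveDates.csv')

# One row per created_at, one column per property_id; 0 where a property
# has no live calendar value for that scrape date.
wide = df.pivot(on="property_id", index="created_at", values="calendar_value",
                aggregate_function="first")
wide = wide.with_columns(pl.exclude("created_at").fill_null(0))
print(wide)
```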