Added 2 ETL pipelines, started investigating pre-booking time
parent d15664df43
commit 1e0b9f1233
@@ -196,16 +196,17 @@ class Database:
         """)

     def extractions(self):
-        return self.connection.sql("""
-            SELECT
-                JSON_EXTRACT(exception, '$.status') AS exception_status,
-                COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
-            FROM
-                consultancy_d.extractions
+        return self.connection.sql(f"""
+            SELECT
+                JSON_EXTRACT(body, '$.content.days') as calendar,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
             WHERE
-                type != 'property'
-            GROUP BY
-                extractions.date
+                type == 'calendar'
+            ORDER BY
+                property_id
         """)

     def extractions_for(self, property_id):
@@ -236,4 +237,33 @@ class Database:
             GROUP BY
                 date
             ORDER BY date ASC
-        """"
+        """)
+
+    def price(self):
+        return self.connection.sql("""
+            SELECT
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+                JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
+            WHERE
+                type == 'price'
+            ORDER BY property_id
+        """)
+
+    def price_developement_per_property(self):
+        return self.connection.sql("""
+            SELECT
+                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
+                property_id,
+                created_at
+            FROM
+                consultancy_d.extractions
+            WHERE
+                type == 'price'
+            ORDER BY property_id
+        """)
+
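Not part of the diff, just for orientation: a minimal sketch of how these new `Database` query methods are consumed by the analysis scripts further down. It assumes, as those scripts do, that `data.load()` returns the wrapper and that `.pl()` converts the query result (which the `.sql(...).pl()` pattern suggests is a DuckDB relation) into a polars DataFrame.

```python
import data

# data.load() is how the scripts below obtain the Database wrapper.
inst = data.load()

# Calendar JSON per extraction: columns calendar, property_id, created_at.
calendar = inst.extractions().pl()

# Lowest price per extraction: pricePerWeek, pricePerNight, currency, property_id, created_at.
prices = inst.price().pl()

# Per-night price history used for the price-accuracy analysis.
price_history = inst.price_developement_per_property().pl()
```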
@@ -0,0 +1,47 @@
import polars as pl
import json
from datetime import datetime, timedelta

def expansion_Pipeline(df):
    '''
    Rearranges a given extractions DataFrame into an expanded DataFrame.
    New columns: property_id, created_at, calendar_date, calendar_value
    :param df: input from the database.py extractions() or extractions_for() functions
    :return: expanded dataframe
    '''
    data = []

    for row in df.iter_rows():
        propId = row[1]
        createdAt = row[2]
        if row[0]:
            temp = json.loads(row[0])
            keys = temp.keys()
            for key in keys:
                out = [propId, createdAt.date(), datetime.strptime(key, '%Y-%m-%d').date(), temp[key]]
                data.append(out)

    df = pl.DataFrame(data, schema=["property_id", "created_at", "calendar_date", "calendar_value"])
    return df


def liveDates_Pipeline(df):
    '''
    Returns the expanded DataFrame with only the live data and no future data.
    :param df: input from the database.py extractions() or extractions_for() functions
    :return: expanded and filtered dataframe
    '''
    df = expansion_Pipeline(df)
    print(df)
    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
    return df


def liveDates_PipelineFromExpanded(df):
    '''
    Filters an already expanded df and returns only the live data and no future data.
    NOTE: The actual live date and the next day are always 0. The most likely reason is that
    booking on the current or next day is forbidden. Workaround: compare with the day after tomorrow.
    :param df: input from expansion_Pipeline
    :return: expanded and filtered dataframe
    '''
    df = df.filter(pl.col("calendar_date") == pl.col("created_at") + timedelta(days=2))
    return df
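Again not part of the commit, only a usage sketch for the two new pipelines, following the docstrings above and the imports used in the pre-booking script at the end of this diff:

```python
import data
from data import etl_pipelines as ep

inst = data.load()
raw = inst.extractions().pl()  # calendar JSON, property_id, created_at

# One row per (property_id, created_at, calendar_date) with the availability value.
expanded = ep.expansion_Pipeline(raw)

# Keep only the "live" view: calendar_date == created_at + 2 days
# (the day after tomorrow, per the NOTE in liveDates_PipelineFromExpanded).
live = ep.liveDates_PipelineFromExpanded(expanded)
print(live)
```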
@@ -1,10 +1,36 @@
import Data_Analysis as DA
import pandas as pd
import os
import re
import numpy as np

def getAccuracy(df, baseLine, compLine):
    try:
        df = df.iloc[[baseLine, compLine]]
    except IndexError:
        return -1
    total = 0
    noChange = 0
    first = True
    for series_name, series in df.items():
        if first:
            first = False
        else:
            total += 1
            #print(series_name)
            if series[baseLine] != -1:
                if series[compLine] != -1:
                    if series[baseLine] == series[compLine]:
                        noChange += 1

    accuracy = noChange / total
    return accuracy


def getMeanAccuracy(accList):
    out = []
    for row in accList:
        row = [x for x in row if x != -1]
        out.append(np.average(row))
    return out


deltaList = [1, 2, 10, 20]
#1 = 1 Scrape Interval
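To make the accuracy metric concrete, a small hypothetical example (the column layout is invented for illustration; the real frames come from the scraped calendar CSVs, and `getAccuracy` is the function defined in the hunk above). It skips the first column, counts every remaining column, and only treats a column as unchanged if both rows hold a value other than -1:

```python
import pandas as pd

# Toy frame: one row per scrape, first column a label, remaining columns calendar values.
toy = pd.DataFrame({
    "date":  ["2024-01-01", "2024-01-02", "2024-01-03"],
    "day_1": [1, 1, 0],
    "day_2": [0, 0, 0],
    "day_3": [-1, 1, 1],
})

# Compare scrape 0 with scrape 1: day_1 and day_2 match, day_3 is ignored because the
# baseline value is -1, so accuracy = 2 / 3.
print(getAccuracy(toy, 0, 1))

# An out-of-range comparison row returns -1, which getMeanAccuracy later filters out.
print(getAccuracy(toy, 0, 10))
```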
@@ -33,12 +59,12 @@ for file in os.listdir(directory):
         accList = []
         #Loop through all Dates as Baseline date
         for i in range(df.shape[0]):
-            acc = DA.getAccuracy(df, i, i+delta)
+            acc = getAccuracy(df, i, i+delta)
             accList.append(acc)
         fullList.append(accList)


-    meanList = DA.getMeanAccuracy(fullList)
+    meanList = getMeanAccuracy(fullList)
     accListDf = accListDf._append({'property_id': propId, 'timedelay_1': fullList[0], 'timedelay_2': fullList[1], 'timedelay_10': fullList[2], 'timedelay_20': fullList[3]}, ignore_index=True)
     accMeanDf = accMeanDf._append({'property_id': propId, 'timedelay_1': meanList[0], 'timedelay_2': meanList[1], 'timedelay_10': meanList[2], 'timedelay_20': meanList[3]}, ignore_index=True)

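One design note on the two lines above: `DataFrame._append` is the private remnant of `DataFrame.append`, which newer pandas releases removed. If portability matters, the same row-wise accumulation can be written with `pd.concat`, e.g. for the mean frame (a sketch reusing the names from the hunk above):

```python
import pandas as pd

# Build a one-row frame and concatenate it onto the accumulator.
row = pd.DataFrame([{'property_id': propId,
                     'timedelay_1': meanList[0],
                     'timedelay_2': meanList[1],
                     'timedelay_10': meanList[2],
                     'timedelay_20': meanList[3]}])
accMeanDf = pd.concat([accMeanDf, row], ignore_index=True)
```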
File diff suppressed because it is too large
@@ -0,0 +1,27 @@
import polars as pl
import pandas as pd
import data


inst = data.load()
#test = inst.price().pl()
test = inst.price_developement_per_property().pl()

data = []

propIds = test.get_column('property_id').to_list()
propIds = list(dict.fromkeys(propIds))
for id in propIds:
    dates = []
    prices = []
    new_dict = {}
    temp = test.filter(property_id=id)
    for row in temp.iter_rows():
        new_dict[row[2].date().strftime("%Y/%m/%d")] = row[0]

    data.append([id, new_dict])

df = pd.DataFrame(data)

df.to_csv('results/priceAccuracyDict.csv')
print(df)
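A small observation, not part of the commit: `pd.DataFrame(data)` stores the dict column as its Python `repr`, which is why the plotting script below has to swap single for double quotes before `json.loads`. Serialising the dict with `json.dumps` while building the rows would make the CSV directly parseable again; a sketch with hypothetical column names:

```python
import json
import pandas as pd

# Same per-property structure as above, but the dict is written as proper JSON text.
df = pd.DataFrame(
    [[prop_id, json.dumps(price_dict)] for prop_id, price_dict in data],
    columns=['property_id', 'prices'],
)
df.to_csv('results/priceAccuracyDict.csv', index=False)
```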
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,48 @@
import re

import pandas as pd
import matplotlib.pyplot as plt
import ast
from datetime import datetime
import json
'''
### For priceAccuracySeparate.csv


df = pd.read_csv('results/priceAccuracySeparate.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df = df.rename({'0': 'property_id', '1': 'dates', '2': 'prices'}, axis=1)


subset = df.loc[df['property_id'] == 10]
x = ast.literal_eval(subset['dates'].tolist()[0])
x_date = [datetime.strptime(item, '%Y/%m/%d') for item in x]
y = ast.literal_eval(subset['prices'].tolist()[0])
y_int = [float(item) for item in y]


plt.plot(x_date, y_int)
plt.show()
'''
df = pd.read_csv('results/priceAccuracyDict.csv')
df.drop(columns=['Unnamed: 0'], inplace=True)
df = df.rename({'0': 'property_id', '1': 'dicts'}, axis=1)

subset = df.loc[df['property_id'] == 10]

temp = subset['dicts'].iloc[0]
temp = re.sub("'", '"', temp)
curr_dict = json.loads(temp)


keys = curr_dict.keys()
keys = [datetime.strptime(item, "%Y/%m/%d") for item in keys]
vals = curr_dict.values()
vals = [float(item) for item in vals]

# plot the parsed dates against the per-night prices
plt.plot(keys, vals)
plt.xticks(rotation=45)
plt.show()
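As the commented-out `priceAccuracySeparate.csv` branch above already does, `ast.literal_eval` can parse the Python-repr dict directly and makes the quote substitution unnecessary; a minimal alternative for the three lines that build `curr_dict`:

```python
import ast

# Parse the stringified dict without rewriting quotes first.
curr_dict = ast.literal_eval(subset['dicts'].iloc[0])
```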
@@ -0,0 +1,34 @@
import data
from data import etl_pipelines as ep
import polars as pl


'''
#Create Data
inst = data.load()

df = inst.extractions().pl()
df = ep.liveDates_Pipeline(df)

df.write_csv('dok/liveDates.csv')
print(df)
'''

#Load Data
df = pl.read_csv('dok/liveDates.csv')

propIds = df.get_column('property_id').unique()

createdDates = df.get_column('created_at').unique()

for propId in propIds:
    for createdAt in createdDates:
        temp = df.filter(pl.col("created_at") == createdAt)
        temp = temp.filter(pl.col("property_id") == propId)
        if temp.shape[0] > 0:
            print(temp.get_column('calendar_value')[0])
        else:
            print(0)


# Continue here
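Purely a sketch of a possible continuation (not in the commit): the nested loop above prints one calendar value per property and scrape date, with 0 for missing combinations; a polars pivot produces the same table in one step. This assumes `calendar_value` was read back as a numeric column; the `on=` keyword is the current polars name for the pivoted column (older releases call it `columns=`).

```python
import polars as pl

df = pl.read_csv('dok/liveDates.csv')

# One row per created_at, one column per property_id; 0 where a property
# has no live calendar value for that scrape date.
wide = df.pivot(on="property_id", index="created_at", values="calendar_value",
                aggregate_function="first")
wide = wide.with_columns(pl.exclude("created_at").fill_null(0))
print(wide)
```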