Investigation of pre-booking time completed

main
maurostoffel 2024-11-28 16:10:53 +01:00
parent 1e0b9f1233
commit 338d3e9cc2
4 changed files with 1422 additions and 15 deletions

.gitignore vendored (2 additions)

@@ -23,6 +23,7 @@
 *.ipr
 .idea/
 # eclipse project file
 .settings/
 .classpath
@@ -65,3 +66,4 @@ env3.*/
 # duckdb
 *.duckdb
+/src/mauro/dok/


@@ -71,6 +71,18 @@ class Database:
                 regions.name
             """)
 
+    def propIds_with_region(self):
+        return self.connection.sql("""
+            SELECT
+                properties.id, seed_id, regions.name
+            FROM
+                consultancy_d.properties
+            LEFT JOIN
+                consultancy_d.seeds ON seeds.id = properties.seed_id
+            LEFT JOIN
+                consultancy_d.regions ON regions.id = seeds.region_id
+            """)
+
     def properties_unreachable(self):
         return self.connection.sql("""
             SELECT
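For context, a minimal sketch of how the new propIds_with_region() helper is consumed: the data.load() entry point and the .pl() relation-to-polars conversion are taken from the analysis script further down in this commit, not from this file.

import data

# data.load() returns the Database wrapper shown above; .pl() converts
# the DuckDB relation into a polars DataFrame.
inst = data.load()
propDf = inst.propIds_with_region().pl()

# Expected columns: properties.id, seed_id, and the region name.
print(propDf.head())

The LEFT JOINs mean properties without a seed or region still come back, with the joined columns null; the script below narrows the result to id and seed_id anyway.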

src/mauro/dok/fullPreoDF.csv (new file, 1312 additions)

File diff suppressed because it is too large.
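Judging by how the analysis script below consumes this file (the rename to "id" and "meanPreorderScrapeNum"), each row appears to hold a property id (column_0) and the mean number of scrapes for which that property's booked dates were already occupied (column_1); this reading is an inference from the script, not shown in the suppressed diff itself.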


@@ -1,34 +1,115 @@
 import data
 from data import etl_pipelines as ep
 import polars as pl
+from datetime import datetime, timedelta
+import pandas as pd
 
 '''
-#Create Data
+# Get Data from DB
 inst = data.load()
 df = inst.extractions().pl()
-df = ep.liveDates_Pipeline(df)
-df.write_csv('dok/liveDates.csv')
+df = ep.expansion_Pipeline(df)
+df.write_csv('dok/flatDates.csv')
 print(df)
 '''
-
-#Load Data
-df = pl.read_csv('dok/liveDates.csv')
-
-propIds = df.get_column('property_id').unique()
-createdAt = df.get_column('created_at').unique()
-
-for propId in propIds:
-    for createdAt in createdAt:
-        temp = df.filter(pl.col("created_at") == createdAt)
-        temp = temp.filter(pl.col("property_id") == propId)
-        if temp.shape[0] > 0:
-            print(temp.get_column('calendar_value')[0])
-        else:
-            print(0)
+'''
+# Load data from CSV
+dfLive = pl.read_csv('dok/liveDates.csv')
+dfFlat = pl.read_csv('dok/flatDates.csv')
+
+# Step 1: get all occupied dates in the live data
+dfLive = dfLive.filter(pl.col("calendar_value") == 0)
+dfLive = dfLive.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))
+dfLive = dfLive.with_columns(pl.col("calendar_date").str.to_date("%Y-%m-%d"))
+#print(dfLive)
+
+dfFlat = dfFlat.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))
+dfFlat = dfFlat.with_columns(pl.col("calendar_date").str.to_date("%Y-%m-%d"))
+
+propIds = dfLive.get_column('property_id').unique()
+createdAt = dfLive.get_column('created_at').unique()
+#print(createdAt)
+
+fullPreorderMatrix = []
+for propId in propIds:
+    curPreorderList = []
+    print("Property ID = " + str(propId))
+    tempPropFlatDf = dfFlat.filter(pl.col("property_id") == propId)
+    tempPropLiveDf = dfLive.filter(pl.col("property_id") == propId)
+    allLiveOccupiedDates = tempPropLiveDf.filter(pl.col("calendar_value") == 0).get_column('created_at')
+    #print("allLiveOccupiedDates = ", allLiveOccupiedDates)
+    for date in allLiveOccupiedDates:
+        calLiveDate = tempPropLiveDf.filter(pl.col("created_at") == date).get_column('calendar_date')[0]
+        #print("Occupied Date = " + str(date), "with Calendar Date =", str(calLiveDate))
+        numOfScrapedPreordered = 0
+        foundLastDate = False
+        for createDate in createdAt:
+            if date > createDate:
+                #print("Finding Flat Date with CreateDate =", createDate, "and Calendar Date =", calLiveDate)
+                tempFlatDf = tempPropFlatDf.filter(pl.col("created_at") == createDate)
+                tempFlatDf = tempFlatDf.filter(pl.col("calendar_date") == calLiveDate)
+                #print("tempFlatDf = ", tempFlatDf)
+                calVal = tempFlatDf.get_column('calendar_value')
+                if len(calVal) > 0:
+                    if calVal[0] == 0:
+                        # still occupied in this scrape
+                        if not foundLastDate:
+                            numOfScrapedPreordered += 1
+                    else:
+                        # found the last scrape where the date was not yet occupied
+                        foundLastDate = True
+                        #print("number of scrapes already occupied =", numOfScrapedPreordered)
+                        #break
+            #else:
+                #print("Skipped: Live Date =", date, "Flat Date =", createDate)
+        #print(propId, date, numOfScrapedPreordered)
+        curPreorderList.append(numOfScrapedPreordered)
+    if len(curPreorderList) > 0:
+        mean = sum(curPreorderList) / len(curPreorderList)
+    else:
+        mean = 0
+    #fullPreorderMatrix.append([propId, mean, curPreorderList])
+    fullPreorderMatrix.append([propId, mean])
+
+print(fullPreorderMatrix)
+fullPreoDF = pl.DataFrame(fullPreorderMatrix, orient="row")
+fullPreoDF.write_csv('dok/fullPreoDF.csv')
+print(fullPreoDF)
+'''
+
+# Filter properties to locations and calculate means per location
+inst = data.load()
+propDf = inst.propIds_with_region().pl()
+print(propDf)
+propDf = propDf.select(
+    pl.col("id").cast(pl.Int64),
+    pl.col("seed_id").cast(pl.Int64),
+)
+
+preoDF = pl.read_csv('dok/fullPreoDF.csv')
+preoDF = preoDF.rename({"column_0": "id", "column_1": "meanPreorderScrapeNum"})
+
+# continue here
+merge = preoDF.join(propDf, how='inner', on='id')
+print(merge)
+print("Global meanPreorderTime = ", round(merge.get_column("meanPreorderScrapeNum").mean()*3, 2))
+
+# 1 = Heidiland
+heidi = merge.filter(pl.col("seed_id") == 1)
+print("Heidiland meanPreorderTime = ", round(heidi.get_column("meanPreorderScrapeNum").mean()*3, 2))
+# 2 = Davos
+Davos = merge.filter(pl.col("seed_id") == 2)
+print("Davos meanPreorderTime = ", round(Davos.get_column("meanPreorderScrapeNum").mean()*3, 2))
+# 3 = Engadin
+Engadin = merge.filter(pl.col("seed_id") == 3)
+print("Engadin meanPreorderTime = ", round(Engadin.get_column("meanPreorderScrapeNum").mean()*3, 2))
+# 4 = St. Moritz
+Moritz = merge.filter(pl.col("seed_id") == 4)
+print("St. Moritz meanPreorderTime = ", round(Moritz.get_column("meanPreorderScrapeNum").mean()*3, 2))