Investigation of pre-booking time completed

main
maurostoffel 2024-11-28 16:10:53 +01:00
parent 1e0b9f1233
commit 338d3e9cc2
4 changed files with 1422 additions and 15 deletions

.gitignore vendored (2 additions)

@@ -23,6 +23,7 @@
 *.ipr
 .idea/
 # eclipse project file
 .settings/
 .classpath
@@ -65,3 +66,4 @@ env3.*/
 # duckdb
 *.duckdb
+/src/mauro/dok/


@@ -71,6 +71,18 @@ class Database:
                 regions.name
             """)
 
+    def propIds_with_region(self):
+        return self.connection.sql("""
+            SELECT
+                properties.id, seed_id, regions.name
+            FROM
+                consultancy_d.properties
+            LEFT JOIN
+                consultancy_d.seeds ON seeds.id = properties.seed_id
+            LEFT JOIN
+                consultancy_d.regions ON regions.id = seeds.region_id
+            """)
+
     def properties_unreachable(self):
         return self.connection.sql("""
             SELECT
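For context, a minimal sketch of how the new propIds_with_region() helper is consumed: the data.load() entry point and the .pl() relation-to-polars conversion are taken from the analysis script further down in this commit, not from this file.

import data

# data.load() returns the Database wrapper shown above; .pl() converts
# the DuckDB relation into a polars DataFrame.
inst = data.load()
propDf = inst.propIds_with_region().pl()

# Expected columns: properties.id, seed_id, and the region name.
print(propDf.head())

The LEFT JOINs mean properties without a seed or region still come back, with the joined columns null; the script below narrows the result to id and seed_id anyway.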

src/mauro/dok/fullPreoDF.csv (new file, 1312 additions)

File diff suppressed because it is too large.
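Judging by how the analysis script below consumes this file (the rename to "id" and "meanPreorderScrapeNum"), each row appears to hold a property id (column_0) and the mean number of scrapes for which that property's booked dates were already occupied (column_1); this reading is an inference from the script, not shown in the suppressed diff itself.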


@@ -1,34 +1,115 @@
 import data
 from data import etl_pipelines as ep
 import polars as pl
+from datetime import datetime, timedelta
+import pandas as pd
 
 '''
-#Create Data
+# Get Data from DB
 inst = data.load()
 df = inst.extractions().pl()
-df = ep.liveDates_Pipeline(df)
-df.write_csv('dok/liveDates.csv')
+df = ep.expansion_Pipeline(df)
+df.write_csv('dok/flatDates.csv')
 print(df)
 '''
-
-#Load Data
-df = pl.read_csv('dok/liveDates.csv')
-
-propIds = df.get_column('property_id').unique()
-createdAt = df.get_column('created_at').unique()
-
-for propId in propIds:
-    for createdAt in createdAt:
-        temp = df.filter(pl.col("created_at") == createdAt)
-        temp = temp.filter(pl.col("property_id") == propId)
-        if temp.shape[0] > 0:
-            print(temp.get_column('calendar_value')[0])
-        else:
-            print(0)
+'''
+# Load data from CSV
+dfLive = pl.read_csv('dok/liveDates.csv')
+dfFlat = pl.read_csv('dok/flatDates.csv')
+
+# Step 1: get all occupied dates in the live data
+dfLive = dfLive.filter(pl.col("calendar_value") == 0)
+dfLive = dfLive.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))
+dfLive = dfLive.with_columns(pl.col("calendar_date").str.to_date("%Y-%m-%d"))
+#print(dfLive)
+
+dfFlat = dfFlat.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))
+dfFlat = dfFlat.with_columns(pl.col("calendar_date").str.to_date("%Y-%m-%d"))
+
+propIds = dfLive.get_column('property_id').unique()
+createdAt = dfLive.get_column('created_at').unique()
+#print(createdAt)
+
+fullPreorderMatrix = []
+for propId in propIds:
+    curPreorderList = []
+    print("Property ID = " + str(propId))
+    tempPropFlatDf = dfFlat.filter(pl.col("property_id") == propId)
+    tempPropLiveDf = dfLive.filter(pl.col("property_id") == propId)
+    allLiveOccupiedDates = tempPropLiveDf.filter(pl.col("calendar_value") == 0).get_column('created_at')
+    #print("allLiveOccupiedDates = ", allLiveOccupiedDates)
+    for date in allLiveOccupiedDates:
+        calLiveDate = tempPropLiveDf.filter(pl.col("created_at") == date).get_column('calendar_date')[0]
+        #print("Occupied Date = " + str(date), "with Calendar Date =", str(calLiveDate))
+        numOfScrapedPreordered = 0
+        foundLastDate = False
+        for createDate in createdAt:
+            if date > createDate:
+                #print("Finding Flat Date with CreateDate =", createDate, "and Calendar Date =", calLiveDate)
+                tempFlatDf = tempPropFlatDf.filter(pl.col("created_at") == createDate)
+                tempFlatDf = tempFlatDf.filter(pl.col("calendar_date") == calLiveDate)
+                #print("tempFlatDf = ", tempFlatDf)
+                calVal = tempFlatDf.get_column('calendar_value')
+                if len(calVal) > 0:
+                    if calVal[0] == 0:
+                        # still occupied in this scrape
+                        if not foundLastDate:
+                            numOfScrapedPreordered += 1
+                    else:
+                        # found the last scrape where the date was not yet occupied
+                        foundLastDate = True
+                        #print("number of scrapes already occupied =", numOfScrapedPreordered)
+                        #break
+            #else:
+                #print("Skipped: Live Date =", date, "Flat Date =", createDate)
+        #print(propId, date, numOfScrapedPreordered)
+        curPreorderList.append(numOfScrapedPreordered)
+    if len(curPreorderList) > 0:
+        mean = sum(curPreorderList) / len(curPreorderList)
+    else:
+        mean = 0
+    #fullPreorderMatrix.append([propId, mean, curPreorderList])
+    fullPreorderMatrix.append([propId, mean])
+
+print(fullPreorderMatrix)
+fullPreoDF = pl.DataFrame(fullPreorderMatrix, orient="row")
+fullPreoDF.write_csv('dok/fullPreoDF.csv')
+print(fullPreoDF)
+'''
+
+# Filter properties to locations and calculate means per location
+inst = data.load()
+propDf = inst.propIds_with_region().pl()
+print(propDf)
+propDf = propDf.select(
+    pl.col("id").cast(pl.Int64),
+    pl.col("seed_id").cast(pl.Int64),
+)
+
+preoDF = pl.read_csv('dok/fullPreoDF.csv')
+preoDF = preoDF.rename({"column_0": "id", "column_1": "meanPreorderScrapeNum"})
+
+# continue here
+merge = preoDF.join(propDf, how='inner', on='id')
+print(merge)
+print("Global meanPreorderTime = ", round(merge.get_column("meanPreorderScrapeNum").mean()*3, 2))
+
+# 1 = Heidiland
+heidi = merge.filter(pl.col("seed_id") == 1)
+print("Heidiland meanPreorderTime = ", round(heidi.get_column("meanPreorderScrapeNum").mean()*3, 2))
+# 2 = Davos
+Davos = merge.filter(pl.col("seed_id") == 2)
+print("Davos meanPreorderTime = ", round(Davos.get_column("meanPreorderScrapeNum").mean()*3, 2))
+# 3 = Engadin
+Engadin = merge.filter(pl.col("seed_id") == 3)
+print("Engadin meanPreorderTime = ", round(Engadin.get_column("meanPreorderScrapeNum").mean()*3, 2))
+# 4 = St. Moritz
+Moritz = merge.filter(pl.col("seed_id") == 4)
+print("St. Moritz meanPreorderTime = ", round(Moritz.get_column("meanPreorderScrapeNum").mean()*3, 2))