From fcd7ca34add35ed17926107c01373ac9a51d8613 Mon Sep 17 00:00:00 2001 From: mmaurostoffel <166130318+mmaurostoffel@users.noreply.github.com> Date: Mon, 13 Jan 2025 17:08:37 +0100 Subject: [PATCH] closes #11 --- etl/src/api/main.py | 6 ++ etl/src/data/database.py | 16 ++++++ .../data/etl_property_capacities_weekdays.py | 2 +- .../data/etl_region_capacities_weekdays.py | 56 +++++++++++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 etl/src/data/etl_region_capacities_weekdays.py diff --git a/etl/src/api/main.py b/etl/src/api/main.py index 21b4307..8e82f6f 100644 --- a/etl/src/api/main.py +++ b/etl/src/api/main.py @@ -5,6 +5,7 @@ from data import etl_property_capacities_monthly as etl_pcm from data import etl_property_capacities_weekdays as etl_pcw from data import etl_property_neighbours as etl_pn from data import etl_region_capacities as etl_rc +from data import etl_region_capacities_weekdays as etl_rcw from data import etl_region_movAverage as etl_rmA from data import etl_region_properties_capacities as etl_rpc from data import etl_region_capacities_comparison as etl_rcc @@ -77,6 +78,11 @@ def region_capacities_data(id: int): capacities = etl_rc.region_capacities(id) return capacities +@app.get("/region/{id}/capacities/weekdays/{scrapeDate}") +def region_capacities_data(id: int, scrapeDate: str): + capacities = etl_rcw.region_capacities_weekdays(id, scrapeDate) + return capacities + @app.get("/region/capacities/comparison/{id_1}/{id_2}") def region_capacities_data(id_1: int, id_2: int): capacities = etl_rcc.region_capacities_comparison(id_1, id_2) diff --git a/etl/src/data/database.py b/etl/src/data/database.py index 2d937b2..7bf3731 100644 --- a/etl/src/data/database.py +++ b/etl/src/data/database.py @@ -473,6 +473,22 @@ class Database: extractions.created_at < '{scrape_date_max}' """) + def singleScrape_of_region_scrapDate(self, region_id: int, scrape_date_min: str, scrape_date_max: str): + return self.connection.sql(f""" + SELECT + 
from io import StringIO

import polars as pl

import data

from datetime import datetime, timedelta

d = data.load()


def region_capacities_weekdays(id: int, scrapeDate_start: str):
    """Return the mean weekday occupancy (in percent) across all calendar
    scrapes of a region captured on the given day.

    :param id: seed/region id whose properties' calendar extractions are
        aggregated.
    :param scrapeDate_start: scrape day as ``'YYYY-MM-DD'``; extractions
        created in the 24h window starting at this date are used.
    :return: dict with keys ``scraping-date`` (timestamp of the last row
        seen, or ``None`` if the query returned nothing), ``weekdays``
        (weekday names, Monday first) and ``capacities`` (mean percentages).
    """
    # Parse the scrape day and build a one-day-wide search window.
    window_start = datetime.strptime(scrapeDate_start, '%Y-%m-%d')
    window_end = window_start + timedelta(days=1)

    extractions = d.singleScrape_of_region_scrapDate(id, window_start, window_end).pl()

    outDf = None          # joined per-property weekday frames
    scrapeDate = None     # created_at of the last row seen
    counter = 0           # numbers the per-property value columns
    for row in extractions.rows(named=True):
        scrapeDate = row['created_at']
        if not row['calendarBody']:
            continue
        counter += 1

        # The JSON body maps date-strings to availability scores; transpose
        # so each scraped date becomes a row.
        df_calendar = pl.read_json(StringIO(row['calendarBody']))
        columnTitles = df_calendar.columns
        df_calendar = df_calendar.transpose()
        df_calendar = df_calendar.with_columns(pl.Series(name="dates", values=columnTitles))
        df_calendar = df_calendar.with_columns(pl.col("dates").str.to_date())

        # Number of covered weeks; clamp to >= 1 so a calendar spanning less
        # than half a week cannot cause a division by zero (bug in the
        # original: round(.../7) could yield 0).
        span_days = (df_calendar.get_column("dates").max() - df_calendar.get_column("dates").min()).days
        numWeeks = max(round(span_days / 7), 1)

        df_calendar = df_calendar.with_columns(pl.col("dates").dt.weekday().alias("weekday_num"))
        df_calendar = df_calendar.with_columns(pl.col("dates").dt.strftime("%A").alias("weekday"))
        df_calendar = df_calendar.drop("dates")

        # Sum the daily scores per weekday, then normalise to percent:
        # divide by the number of weeks and by the maximum daily score of 2.
        df_calendar = df_calendar.group_by(["weekday", "weekday_num"]).agg(pl.col("column_0").sum())
        df_calendar = df_calendar.with_columns((pl.col("column_0") / numWeeks / 2 * 100).alias("column_0"))
        df_calendar = df_calendar.sort('weekday_num')
        df_calendar = df_calendar.drop('weekday_num')
        df_calendar = df_calendar.rename({'column_0': str(counter)})

        if outDf is None:
            outDf = df_calendar
        else:
            outDf = outDf.join(df_calendar, on='weekday')

    # No usable calendar bodies: return an empty result instead of raising
    # NameError on the unbound outDf (bug in the original implementation).
    if outDf is None:
        return {"scraping-date": scrapeDate, "weekdays": [], 'capacities': []}

    # Horizontal mean across the per-property columns only; the string
    # 'weekday' column must be excluded from the numeric aggregation.
    outDf = outDf.with_columns(pl.mean_horizontal(pl.exclude("weekday")).alias("mean"))
    outDf = outDf[['weekday', 'mean']]

    return {
        "scraping-date": scrapeDate,
        "weekdays": outDf['weekday'].to_list(),
        'capacities': outDf['mean'].to_list(),
    }