From a8b856b714b95378f272ad26c5e8069c13dfd8be Mon Sep 17 00:00:00 2001 From: mmaurostoffel <166130318+mmaurostoffel@users.noreply.github.com> Date: Fri, 20 Dec 2024 20:57:10 +0100 Subject: [PATCH] =?UTF-8?q?etl=5Fregion=5Fcapacities=20erstellt=20+=20data?= =?UTF-8?q?base=20und=20api/main=20Anpassungen=20daf=C3=BCr?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- etl/src/api/main.py | 6 +++ etl/src/data/database.py | 17 +++++++- etl/src/data/etl_region_capacities.py | 57 +++++++++++++++++++++++++++ etl/src/mauro/data_quality.py | 2 - 4 files changed, 78 insertions(+), 4 deletions(-) create mode 100644 etl/src/data/etl_region_capacities.py diff --git a/etl/src/api/main.py b/etl/src/api/main.py index cc68ed8..81a8571 100644 --- a/etl/src/api/main.py +++ b/etl/src/api/main.py @@ -44,4 +44,10 @@ def property_capacities_data(id: int): def property_base_data(id: int): return d.property_base_data(id).pl().to_dicts() +@app.get("/region/{id}/capacities") +def region_capacities_data(id: int): + capacities = etl_pc.region_capacities(id) + return capacities + + diff --git a/etl/src/data/database.py b/etl/src/data/database.py index ca15b2d..d020dbf 100644 --- a/etl/src/data/database.py +++ b/etl/src/data/database.py @@ -28,8 +28,6 @@ class Database: if(spatial_installed and not spatial_installed[0]): self.connection.sql("INSTALL spatial") - - def db_overview(self): return self.connection.sql("DESCRIBE;").show() @@ -403,5 +401,20 @@ class Database: consultancy_d.properties p """) + def capacity_of_region(self, region_id): + return self.connection.sql(f""" + SELECT + JSON_EXTRACT(body, '$.content.days') as calendarBody, + strftime(extractions.created_at, '%Y-%m-%d') AS ScrapeDate, + extractions.property_id, + FROM + consultancy_d.extractions + LEFT JOIN + consultancy_d.properties ON properties.id = extractions.property_id + WHERE + type == 'calendar' AND + properties.seed_id = {region_id} + """) + diff --git 
a/etl/src/data/etl_region_capacities.py b/etl/src/data/etl_region_capacities.py
new file mode 100644
index 0000000..3d65f3f
--- /dev/null
+++ b/etl/src/data/etl_region_capacities.py
@@ -0,0 +1,79 @@
+from etl.src import data
+import polars as pl
+from io import StringIO
+import numpy as np
+
+
+d = data.load()
+
+
+def _calendar_sum(calendar_body):
+    """Return the horizontal sum of one calendar JSON string, or 0 if it is empty."""
+    # A missing calendar (None or empty string) counts as 0
+    if not calendar_body:
+        return 0
+    calDF = pl.read_json(StringIO(calendar_body))
+    return calDF.sum_horizontal()[0]
+
+
+def region_capacities(id: int):
+    """Build a grid of calendar sums per property and scrape date for one region.
+
+    Args:
+        id: Region id, passed to ``d.capacity_of_region``.
+
+    Returns:
+        A dict with the keys ``scrapeDates`` (sorted unique scrape dates),
+        ``property_ids`` (sorted unique property ids) and ``values``, a list of
+        ``[prop_index, date_index, value]`` rows. The indices are positions in
+        the two sorted lists; ``value`` is the calendar sum scaled so that the
+        largest sum becomes 100. ``values`` is empty when the region has no
+        calendar rows.
+    """
+    # Get Data
+    df = d.capacity_of_region(id).pl()
+    # turn PropertyIDs to ints for sorting
+    df = df.cast({"property_id": int})
+
+    # Get uniques for dates and propIDs and sort them
+    listOfDates = df.get_column("ScrapeDate").unique().sort()
+    listOfPropertyIDs = df.get_column("property_id").unique().sort()
+
+    # Create DFs from lists to merge later
+    datesDF = pl.DataFrame(listOfDates).with_row_index("date_index")
+    propIdDF = pl.DataFrame(listOfPropertyIDs).with_row_index("prop_index")
+
+    # Merge Dataframe to generate indices
+    df = df.join(datesDF, on='ScrapeDate')
+    df = df.join(propIdDF, on='property_id')
+
+    # Drop now useless columns ScrapeDate and property_id
+    df = df[['calendarBody', 'date_index', 'prop_index']]
+
+    # Calculate grid values
+    gridData = [
+        [row['prop_index'], row['date_index'], _calendar_sum(row['calendarBody'])]
+        for row in df.rows(named=True)
+    ]
+
+    # An empty region would give a 1-D array that cannot be column-indexed,
+    # so only normalise when there is at least one row
+    if gridData:
+        gridData = np.array(gridData)
+        maxValue = np.max(gridData[:, 2])
+        # Skip scaling when every sum is 0 to avoid dividing by zero
+        if maxValue != 0:
+            # NOTE(review): if all entries are ints the array dtype is integer
+            # and this assignment truncates the percentages - confirm intended
+            gridData[:, 2] = (gridData[:, 2]*100)/maxValue
+        # Return back to list
+        gridData = gridData.tolist()
+
+    # Cast listOfDates to dates
+    listOfDates = listOfDates.cast(pl.Date).to_list()
+    listOfPropertyIDs = listOfPropertyIDs.to_list()
+
+    # Create JSON
+    outDict = {'scrapeDates': listOfDates, 'property_ids': listOfPropertyIDs, 'values': gridData}
+
+    return outDict
diff --git a/etl/src/mauro/data_quality.py b/etl/src/mauro/data_quality.py
index
0530133..8c00f4b 100644 --- a/etl/src/mauro/data_quality.py +++ b/etl/src/mauro/data_quality.py @@ -31,8 +31,6 @@ dfNew = pl.from_dicts(data) dfNew.write_csv('results/data_quality.csv') print(dfNew) - - ''' dfNew = pl.read_csv('results/data_quality.csv') dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))