From 67382003ca3bbe87df655445f53cfadeefdef459 Mon Sep 17 00:00:00 2001
From: mmaurostoffel <166130318+mmaurostoffel@users.noreply.github.com>
Date: Sat, 11 Jan 2025 17:33:50 +0100
Subject: [PATCH] closes #7: etl_region_capacities created

As described in the issue, etl_region_capacities was adapted into
etl_region_properties_capacities, and the endpoints were updated as well.
The query for the global data is implemented and works, but it still takes
quite a long time.
---
 etl/src/api/main.py                           |  6 ++
 etl/src/data/etl_region_capacities.py         | 86 +++++++++----------
 .../data/etl_region_properties_capacities.py | 61 +++++++++++++
 3 files changed, 106 insertions(+), 47 deletions(-)
 create mode 100644 etl/src/data/etl_region_properties_capacities.py

diff --git a/etl/src/api/main.py b/etl/src/api/main.py
index fa2fdfe..f853e57 100644
--- a/etl/src/api/main.py
+++ b/etl/src/api/main.py
@@ -5,6 +5,7 @@ from data import etl_property_capacities_monthly as etl_pcm
 from data import etl_property_capacities_weekdays as etl_pcw
 from data import etl_property_neighbours as etl_pn
 from data import etl_region_capacities as etl_rc
+from data import etl_region_properties_capacities as etl_rpc
 from data import etl_region_capacities_comparison as etl_rcc
 from fastapi import FastAPI, Response
 
@@ -64,6 +65,11 @@ def property_capacities_data(id: int, scrapeDate: str):
 def property_base_data(id: int):
     return d.property_base_data(id).pl().to_dicts()
 
+@app.get("/region/{id}/properties/capacities")
+def region_property_capacities_data(id: int):
+    capacities = etl_rpc.region_properties_capacities(id)
+    return capacities
+
 @app.get("/region/{id}/capacities")
 def region_capacities_data(id: int):
     capacities = etl_rc.region_capacities(id)
diff --git a/etl/src/data/etl_region_capacities.py b/etl/src/data/etl_region_capacities.py
index 1704258..db7a1fe 100644
--- a/etl/src/data/etl_region_capacities.py
+++ b/etl/src/data/etl_region_capacities.py
@@ -1,61 +1,53 @@
-import data
-import polars as pl
 from io import StringIO
-import numpy as np
+from datetime import date
+import polars as pl
+
+import data
 
 
 d = data.load()
 
 def region_capacities(id: int):
-    # Get Data
-    if id == -1:
-        df = d.capacity_global().pl()
-    else:
-        df = d.capacity_of_region(id).pl()
-    # turn PropertyIDs to ints for sorting
-    df = df.cast({"property_id": int})
-    # Get uniques for dates and propIDs and sort them
-    listOfDates = df.get_column("ScrapeDate").unique().sort()
-    listOfPropertyIDs = df.get_column("property_id").unique().sort()
+    # Get Data
+    if id == -1:
+        extractions = d.capacity_global().pl()
+    else:
+        extractions = d.capacity_of_region(id).pl()
+    # turn PropertyIDs to ints for sorting
+    extractions = extractions.cast({"property_id": int})
 
-    # Create DFs from lists to merge later
-    datesDF = pl.DataFrame(listOfDates).with_row_index("date_index")
-    propIdDF = pl.DataFrame(listOfPropertyIDs).with_row_index("prop_index")
+    # property_id is not needed for the aggregation below
+    extractions = extractions.drop('property_id')
 
-    # Merge Dataframe to generate indices
-    df = df.join(datesDF, on='ScrapeDate')
-    df = df.join(propIdDF, on='property_id')
-    # Drop now useless columns ScrapeDate and property_id
-    df = df[['ScrapeDate', 'calendarBody', 'date_index', 'prop_index']]
-    # Calculate grid values
-    gridData = []
-    for row in df.rows(named=True):
-        # Return 0 for sum if calendar is null
-        if row['calendarBody']:
-            calDF = pl.read_json(StringIO(row['calendarBody']))
-            sum_hor = calDF.sum_horizontal()[0]
-        else:
-            sum_hor = 0
-        # With Index
-        # gridData.append([row['prop_index'], row['date_index'], sum_hor])
-        # With ScrapeDate
-        gridData.append([row['ScrapeDate'], row['date_index'], sum_hor])
+    # Sum up each property's calendar and record its length, per scrape date
+    gridData = []
+    for row in extractions.rows(named=True):
+        # Return 0 for sum and day count if calendar is null
+        if row['calendarBody']:
+            calDF = pl.read_json(StringIO(row['calendarBody']))
+            sum_hor = calDF.sum_horizontal()[0]
+            num_days = calDF.width
+        else:
+            sum_hor = 0
+            num_days = 0
+        gridData.append([row['ScrapeDate'], sum_hor, num_days])
 
-    gridData = np.array(gridData)
-    # get all values to calculate Max
-    allValues = gridData[:, 2].astype(int)
-    maxValue = np.max(allValues)
-    gridData[:, 2] = (allValues*100)/maxValue
+    # Create Aggregates of values per ScrapeDate
+    df = pl.DataFrame(gridData, orient="row")
+    df_count = df.group_by("column_0").agg(pl.col("column_1").count())
+    df_sum = df.group_by("column_0").agg(pl.col("column_1").sum())
+    df_numDays = df.group_by("column_0").agg(pl.col("column_2").max())
 
-    # Return back to list
-    gridData = gridData.tolist()
+    # Join and rename DF's
+    df = df_sum.join(df_count, on='column_0').join(df_numDays, on='column_0')
+    df = df.rename({"column_0": "ScrapeDate", "column_1": "Sum", "column_1_right": "num_properties", "column_2": "num_days"})
 
-    # Cast listOfDates to datetime
-    listOfDates = listOfDates.cast(pl.Date).to_list()
-    listOfPropertyIDs = listOfPropertyIDs.to_list()
+    # Calculate normed capacities for each ScrapeDate (a single day can contribute at most 2)
+    df = df.with_columns((pl.col("Sum") / pl.col("num_properties") / (pl.col("num_days") * 2) * 100).alias("capacity"))
 
-    # Create JSON
-    outDict = {'scrapeDates': listOfDates, 'property_ids': listOfPropertyIDs, 'values': gridData}
+    # Sort by the date column
+    df = df.cast({"ScrapeDate": date})
+    df = df.sort('ScrapeDate')
 
-    return outDict
\ No newline at end of file
+    result = {"capacities": df['capacity'].to_list(), "dates": df['ScrapeDate'].to_list()}
+    return result
\ No newline at end of file
diff --git a/etl/src/data/etl_region_properties_capacities.py b/etl/src/data/etl_region_properties_capacities.py
new file mode 100644
index 0000000..4cb351d
--- /dev/null
+++ b/etl/src/data/etl_region_properties_capacities.py
@@ -0,0 +1,61 @@
+import data
+import polars as pl
+from io import StringIO
+import numpy as np
+
+
+d = data.load()
+
+def region_properties_capacities(id: int):
+    # Get Data
+    if id == -1:
+        df = d.capacity_global().pl()
+    else:
+        df = d.capacity_of_region(id).pl()
+    # turn PropertyIDs to ints for sorting
+    df = df.cast({"property_id": int})
+
+    # Get uniques for dates and propIDs and sort them
+    listOfDates = df.get_column("ScrapeDate").unique().sort()
+    listOfPropertyIDs = df.get_column("property_id").unique().sort()
+
+    # Create DFs from lists to merge later
+    datesDF = pl.DataFrame(listOfDates).with_row_index("date_index")
+    propIdDF = pl.DataFrame(listOfPropertyIDs).with_row_index("prop_index")
+
+    # Merge Dataframe to generate indices
+    df = df.join(datesDF, on='ScrapeDate')
+    df = df.join(propIdDF, on='property_id')
+    # Drop now useless columns ScrapeDate and property_id
+    df = df[['ScrapeDate', 'calendarBody', 'date_index', 'prop_index']]
+    # Calculate grid values
+    gridData = []
+    for row in df.rows(named=True):
+        # Return 0 for sum if calendar is null
+        if row['calendarBody']:
+            calDF = pl.read_json(StringIO(row['calendarBody']))
+            sum_hor = calDF.sum_horizontal()[0]
+        else:
+            sum_hor = 0
+        # With Index
+        # gridData.append([row['prop_index'], row['date_index'], sum_hor])
+        # With ScrapeDate
+        gridData.append([row['ScrapeDate'], row['date_index'], sum_hor])
+
+    gridData = np.array(gridData, dtype=object)
+    # get all values to calculate Max
+    allValues = gridData[:, 2].astype(int)
+    maxValue = np.max(allValues)
+    gridData[:, 2] = (allValues*100)/maxValue
+
+    # Return back to list
+    gridData = gridData.tolist()
+
+    # Cast listOfDates to datetime
+    listOfDates = listOfDates.cast(pl.Date).to_list()
+    listOfPropertyIDs = listOfPropertyIDs.to_list()
+
+    # Create JSON
+    outDict = {'scrapeDates': listOfDates, 'property_ids': listOfPropertyIDs, 'values': gridData}
+
+    return outDict
\ No newline at end of file
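
Usage sketch for the new endpoint: assuming the FastAPI app in
etl/src/api/main.py is running and reachable at http://localhost:8000 (host
and port are assumptions and depend on how the service is started), the
per-property capacity grid of a region can be fetched like this:

    import json
    from urllib.request import urlopen

    # id 1 is only an example region; id -1 triggers the global query,
    # which the commit message notes is slow.
    url = "http://localhost:8000/region/1/properties/capacities"
    with urlopen(url) as response:
        payload = json.load(response)

    # region_properties_capacities() returns three parallel structures:
    #   scrapeDates  - sorted list of scrape dates
    #   property_ids - sorted list of property ids in the region
    #   values       - rows of [ScrapeDate, date_index, value normed to 0-100]
    print(len(payload["property_ids"]), "properties over", len(payload["scrapeDates"]), "scrape dates")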