ConsultancyProject_2_ETL/etl/src/data/etl_region_properties_capacities.py
from io import StringIO

import polars as pl

import data
from data import etl_cache

# Data-access layer for this ETL project, loaded once at import time
d = data.load()


def region_properties_capacities(id: int):
    """Return the capacity grid (dates x properties) for one region, or globally for id == -1."""
    file = f"etl_region_properties_capacities_{id}.obj"
    # Serve a previously computed result from the cache if present
    obj = etl_cache.openObj(file)
    if obj:
        return obj
    # Get data: id == -1 selects the global capacity view
    if id == -1:
        df = d.capacity_global().pl()
    else:
        df = d.capacity_of_region(id).pl()
    # Turn property IDs into ints so they sort numerically
    df = df.cast({"property_id": int})
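    # Note (assuming a recent polars version): a plain Python `int` in the
    # cast mapping is shorthand for pl.Int64, i.e. the line above should be
    # equivalent to df.cast({"property_id": pl.Int64}).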
    # Collect the unique dates and property IDs, sorted, as the grid axes
    listOfDates = df.get_column("ScrapeDate").unique().sort()
    listOfPropertyIDs = df.get_column("property_id").unique().sort()
    # Wrap the axes in DataFrames so they can be joined back as indices
    datesDF = pl.DataFrame(listOfDates).with_row_index("date_index")
    propIdDF = pl.DataFrame(listOfPropertyIDs).with_row_index("prop_index")
    # Join the index columns onto the observations
    df = df.join(datesDF, on="ScrapeDate")
    df = df.join(propIdDF, on="property_id")
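    # Conceptually, each observation row now carries its grid coordinates
    # (illustrative values, not real data):
    #   ScrapeDate  | property_id | calendarBody | date_index | prop_index
    #   2024-01-01  | 42          | '{...}'      | 0          | 7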
    # Compute one capacity sum per (scrape date, property) observation; rows
    # are collected in a list and turned into a DataFrame once, which avoids
    # re-allocating via vstack() on every iteration
    gridRows = []
    for row in df.rows(named=True):
        # A missing/null calendar counts as a capacity sum of 0
        if row["calendarBody"]:
            calDF = pl.read_json(StringIO(row["calendarBody"]))
            sum_hor = calDF.sum_horizontal()[0]
        else:
            sum_hor = 0
        gridRows.append({"scrape_date": row["ScrapeDate"], "property_id": str(row["property_id"]), "sum_hor": sum_hor})
    gridData = pl.DataFrame(gridRows, schema=[("scrape_date", pl.String), ("property_id", pl.String), ("sum_hor", pl.Int64)])
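    # Illustrative payload (an assumption about the scraper's format, not
    # taken from this file): `calendarBody` is a JSON object of numeric
    # availability flags, e.g. '{"2024-01-01": 1, "2024-01-02": 0}', which
    # read_json() parses into a one-row frame whose horizontal sum is the
    # number of available days in that snapshot.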
    # Scale every sum against the overall maximum, as a percentage
    maxValue = gridData["sum_hor"].max()
    values = []
    for row in gridData.rows(named=True):
        # Guard: an empty or all-zero grid would otherwise divide by zero
        capacity = (row["sum_hor"] * 100) / maxValue if maxValue else 0
        values.append({"date": row["scrape_date"], "property_id": row["property_id"], "capacity": capacity})
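    # Worked example with illustrative numbers: for sums [30, 60, 120] the
    # maximum is 120, so the capacities become [25.0, 50.0, 100.0] percent.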
    # Cast the axes to plain Python values for the output dict
    listOfDates = listOfDates.cast(pl.Date).to_list()
    listOfPropertyIDs = listOfPropertyIDs.cast(pl.String).to_list()
    # Assemble the result and cache it for subsequent calls
    outDict = {"dates": listOfDates, "property_ids": listOfPropertyIDs, "values": values}
    etl_cache.saveObj(file, outDict)
    return outDict
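

# Minimal usage sketch (hypothetical; assumes the project's data layer and
# etl_cache are configured and the backing store is reachable):
if __name__ == "__main__":
    out = region_properties_capacities(-1)  # -1 selects the global view
    print(len(out["dates"]), "dates x", len(out["property_ids"]), "properties")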