ConsultancyProject_2_ETL/etl/src/data/etl_region_capacities.py

from datetime import date
from io import StringIO
import polars as pl
import data

d = data.load()


def region_capacities(id: int):
    # Get the raw extractions for the requested region, or for all regions if id == -1
    if id == -1:
        extractions = d.capacity_global().pl()
    else:
        extractions = d.capacity_of_region(id).pl()
    # Turn property IDs into ints for sorting, then drop the column since it is not needed below
    extractions = extractions.cast({"property_id": int}).drop("property_id")
    # Build one row per extraction from the calendar JSON: scrape date, horizontal sum, calendar width
    gridData = pl.DataFrame(schema=[("scrape_date", pl.String), ("sum_hor", pl.Int64), ("calendar_width", pl.Int64)])
    for row in extractions.rows(named=True):
        # Use 0 for the sum and the width if the calendar is null
        if row['calendarBody']:
            calDF = pl.read_json(StringIO(row['calendarBody']))
            sum_hor = calDF.sum_horizontal()[0]
            calendar_width = calDF.width
        else:
            sum_hor = 0
            calendar_width = 0
        gridData = gridData.vstack(
            pl.DataFrame({"scrape_date": row['ScrapeDate'], "sum_hor": sum_hor, "calendar_width": calendar_width})
        )
    # Create aggregates per scrape date: number of calendars, summed occupancy, and the widest calendar
    df_count = gridData.group_by("scrape_date").agg(pl.col("sum_hor").count())
    df_sum = gridData.group_by("scrape_date").agg(pl.col("sum_hor").sum())
    df_numDays = gridData.group_by("scrape_date").agg(pl.col("calendar_width").max())
    # Join the aggregates; the count column from df_count is suffixed with "_right" by the join
    df = df_sum.join(df_count, on='scrape_date').join(df_numDays, on='scrape_date')
    # Calculate the normed capacity (in percent) for each scrape date
    df = df.with_columns((pl.col("sum_hor") / pl.col("sum_hor_right") / (pl.col("calendar_width") * 2) * 100).alias("capacity"))
    # Cast the scrape date strings to dates and sort chronologically
    df = df.cast({"scrape_date": date}).sort('scrape_date')
    return {"capacities": df['capacity'].to_list(), "dates": df['scrape_date'].to_list()}
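

# A minimal usage sketch (an assumption, not part of the original module): callers are
# expected to pass a region id, or -1 to aggregate capacities over all regions.
if __name__ == "__main__":
    result = region_capacities(-1)
    for scrape_date, capacity in zip(result["dates"], result["capacities"]):
        print(f"{scrape_date}: {capacity:.1f}%")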