diff --git a/etl/src/data/etl_region_capacities.py b/etl/src/data/etl_region_capacities.py index 989cf5b..39b0b0b 100644 --- a/etl/src/data/etl_region_capacities.py +++ b/etl/src/data/etl_region_capacities.py @@ -1,5 +1,5 @@ -from io import StringIO from datetime import date +from io import StringIO import polars as pl @@ -21,7 +21,7 @@ def region_capacities(id: int): df_dates = pl.DataFrame() # Get Data from JSON - gridData = [] + gridData = pl.DataFrame(schema=[("scrape_date", pl.String), ("sum_hor", pl.Int64), ("calendar_width", pl.Int64)]) dayCounts = [] for row in extractions.rows(named=True): # Return 0 for sum if calendar is null @@ -30,24 +30,21 @@ def region_capacities(id: int): sum_hor = calDF.sum_horizontal()[0] else: sum_hor = 0 - gridData.append([row['ScrapeDate'], sum_hor, calDF.width]) + gridData = gridData.vstack(pl.DataFrame({"scrape_date" : row['ScrapeDate'], "sum_hor": sum_hor, "calendar_width": calDF.width})) # Create Aggregates of values - df = pl.DataFrame(gridData, strict=False) - df_count = df.group_by("column_0").agg(pl.col("column_1").count()) - df_sum = df.group_by("column_0").agg(pl.col("column_1").sum()) - df_numDays = df.group_by("column_0").agg(pl.col("column_2").max()) + df_count = gridData.group_by("scrape_date").agg(pl.col("sum_hor").count()) + df_sum = gridData.group_by("scrape_date").agg(pl.col("sum_hor").sum()) + df_numDays = gridData.group_by("scrape_date").agg(pl.col("calendar_width").max()) # Join and rename DF's - df = df_sum.join(df_count, on= 'column_0').join(df_numDays, on= 'column_0') - df = df.rename({"column_0": "ScrapeDate", "column_1": "Sum", "column_1_right": "num_properties", "column_2": "max_value", }) + df = df_sum.join(df_count, on= 'scrape_date').join(df_numDays, on= 'scrape_date') # Calculate normed capacities for each scrapeDate - df = df.with_columns((pl.col("Sum") / pl.col("num_properties") / (pl.col("max_value")*2) * 100).alias("capacity")) + df = df.with_columns((pl.col("sum_hor") / pl.col("sum_hor_right") / (pl.col("calendar_width")*2) * 100).alias("capacity")) # Sort the date column - df = df.cast({"ScrapeDate": date}) - df = df.sort('ScrapeDate') + df = df.cast({"scrape_date": date}).sort('scrape_date') - result = {"capacities": df['capacity'].to_list(), "dates": df['ScrapeDate'].to_list()} + result = {"capacities": df['capacity'].to_list(), "dates": df['scrape_date'].to_list()} return result \ No newline at end of file