Betrifft #7. Möglicher Fix, bitte Resultat kontrollieren. Das Problem lag m. E. darin, dass durch das Hin und Her zwischen Listen und DataFrame die Typisierung der Werte verloren ging, weshalb es dann auch den entsprechenden Fehler schmiss.

This commit is contained in:
Giò Diani 2025-01-12 11:56:33 +01:00
parent e176d1e73f
commit 67c0d85213

View File

@ -1,5 +1,5 @@
from io import StringIO
from datetime import date
from io import StringIO
import polars as pl
@ -21,7 +21,7 @@ def region_capacities(id: int):
df_dates = pl.DataFrame()
# Get Data from JSON
gridData = []
gridData = pl.DataFrame(schema=[("scrape_date", pl.String), ("sum_hor", pl.Int64), ("calendar_width", pl.Int64)])
dayCounts = []
for row in extractions.rows(named=True):
# Return 0 for sum if calendar is null
@ -30,24 +30,21 @@ def region_capacities(id: int):
sum_hor = calDF.sum_horizontal()[0]
else:
sum_hor = 0
gridData.append([row['ScrapeDate'], sum_hor, calDF.width])
gridData = gridData.vstack(pl.DataFrame({"scrape_date" : row['ScrapeDate'], "sum_hor": sum_hor, "calendar_width": calDF.width}))
# Create Aggregates of values
df = pl.DataFrame(gridData, strict=False)
df_count = df.group_by("column_0").agg(pl.col("column_1").count())
df_sum = df.group_by("column_0").agg(pl.col("column_1").sum())
df_numDays = df.group_by("column_0").agg(pl.col("column_2").max())
df_count = gridData.group_by("scrape_date").agg(pl.col("sum_hor").count())
df_sum = gridData.group_by("scrape_date").agg(pl.col("sum_hor").sum())
df_numDays = gridData.group_by("scrape_date").agg(pl.col("calendar_width").max())
# Join and rename DF's
df = df_sum.join(df_count, on= 'column_0').join(df_numDays, on= 'column_0')
df = df.rename({"column_0": "ScrapeDate", "column_1": "Sum", "column_1_right": "num_properties", "column_2": "max_value", })
df = df_sum.join(df_count, on= 'scrape_date').join(df_numDays, on= 'scrape_date')
# Calculate normed capacities for each scrapeDate
df = df.with_columns((pl.col("Sum") / pl.col("num_properties") / (pl.col("max_value")*2) * 100).alias("capacity"))
df = df.with_columns((pl.col("sum_hor") / pl.col("sum_hor_right") / (pl.col("calendar_width")*2) * 100).alias("capacity"))
# Sort the date column
df = df.cast({"ScrapeDate": date})
df = df.sort('ScrapeDate')
df = df.cast({"scrape_date": date}).sort('scrape_date')
result = {"capacities": df['capacity'].to_list(), "dates": df['ScrapeDate'].to_list()}
result = {"capacities": df['capacity'].to_list(), "dates": df['scrape_date'].to_list()}
return result