ConsultancyProject_2_ETL/etl/src/data/etl_region_capacities_monthly.py
2025-01-18 15:39:29 +01:00

65 lines
2.6 KiB
Python

from datetime import datetime, timedelta
from io import StringIO
import polars as pl
import data
from data import etl_cache
# Shared data-access handle, created once when this module is imported.
# The scrape queries in this module are issued through it.
d = data.load()
def _monthly_capacity_frame(calendar_json: str, column_name: str) -> "pl.DataFrame":
    """Turn one scraped calendar JSON document into monthly capacity values.

    The JSON is read as a single-row frame whose column names are date
    strings; after transposing, ``column_0`` holds the per-day values.

    Returns a frame sorted chronologically with two columns: ``date_short``
    (e.g. ``"Jan 2025"``) and ``column_name`` (the monthly value).
    """
    calendar = pl.read_json(StringIO(calendar_json))
    day_labels = calendar.columns
    return (
        calendar.transpose()
        .with_columns(pl.Series(name="dates", values=day_labels))
        .with_columns(pl.col("dates").str.to_date())
        # All three expressions read the Date-typed "dates" column of the
        # incoming frame; "dates" is then replaced by a sortable "YYYY MM" key.
        .with_columns(
            pl.col("dates").dt.month_end().dt.day().alias('numDays'),
            (pl.col("dates").dt.strftime("%b") + " " + pl.col("dates").dt.strftime("%Y")).alias('date_short'),
            (pl.col("dates").dt.strftime("%Y") + " " + pl.col("dates").dt.strftime("%m")).alias('dates'),
        )
        .group_by(['dates', 'date_short', 'numDays'])
        .agg(pl.col("column_0").sum())
        # Monthly sum / days in that calendar month / 2 * 100.
        # NOTE(review): presumably daily values range 0..2, making this a
        # percentage - confirm against the scraper.
        .with_columns((pl.col("column_0") / pl.col("numDays") / 2 * 100).alias("column_0"))
        .sort('dates')
        .drop('dates')
        .drop('numDays')
        .rename({'column_0': column_name})
    )


def region_capacities_monthly(id: int, scrapeDate_start: str):
    """Return the mean monthly capacity over all scrapes of one day.

    Args:
        id: Region identifier; ``-1`` selects the global query instead of
            the per-region query.
        scrapeDate_start: Start of the one-day search window. An ISO-format
            string (e.g. ``"2025-01-18"``) is parsed; date/datetime objects
            are used as given.

    Returns:
        A dict with keys ``"date"`` (``created_at`` of the last scrape row
        seen, or ``None`` if there was none), ``"months"`` (month labels such
        as ``"Jan 2025"``) and ``"capacities"`` (mean value per month across
        all scrapes that carried a calendar). If a cached object exists for
        this id/date it is returned as-is. When no scrape in the window has a
        calendar, both lists are empty and nothing is cached.

    Raises:
        ValueError: If ``scrapeDate_start`` is a string that is not in ISO
            format.
    """
    # The cache key uses the argument exactly as passed in, so existing cache
    # files keep matching.
    file = f"etl_region_capacities_monthly_{id}_{scrapeDate_start}.obj"
    obj = etl_cache.openObj(file)
    if obj:
        return obj

    # A plain string cannot be added to a timedelta; parse it first.
    if isinstance(scrapeDate_start, str):
        scrapeDate_start = datetime.fromisoformat(scrapeDate_start)
    # Get end date of start search-window
    scrapeDate_end = scrapeDate_start + timedelta(days=1)

    # Get Data
    if id == -1:
        extractions = d.singleScrape_of_global_scrapDate(scrapeDate_start, scrapeDate_end).pl()
    else:
        extractions = d.singleScrape_of_region_scrapDate(id, scrapeDate_start, scrapeDate_end).pl()

    scrapeDate = None
    outDf = None
    counter = 0
    for row in extractions.rows(named=True):
        scrapeDate = row['created_at']
        if not row['calendarBody']:
            continue
        counter += 1
        # Each scrape contributes one value column, named by its running index.
        df_calendar = _monthly_capacity_frame(row['calendarBody'], str(counter))
        if outDf is None:
            outDf = df_calendar
        else:
            outDf = outDf.join(df_calendar, on='date_short')

    if outDf is None:
        # No scrape carried a calendar: return an empty result instead of
        # failing, and skip caching so a later call can pick up new data.
        return {"date": scrapeDate, "months": [], "capacities": []}

    # Calculate horizontal mean over the per-scrape value columns only; the
    # string label column must not take part in the mean.
    means = outDf.select(pl.exclude('date_short')).mean_horizontal()
    result = {"date": scrapeDate, "months": outDf['date_short'].to_list(), 'capacities': means.to_list()}
    etl_cache.saveObj(file, result)
    return result