Created etl_region_capacities + database and api/main adjustments for it

main
mmaurostoffel 2024-12-20 20:57:10 +01:00
parent 66d048c70e
commit a8b856b714
4 changed files with 78 additions and 4 deletions

View File

@@ -44,4 +44,10 @@ def property_capacities_data(id: int):
def property_base_data(id: int):
    return d.property_base_data(id).pl().to_dicts()

@app.get("/region/{id}/capacities")
def region_capacities_data(id: int):
    capacities = etl_pc.region_capacities(id)
    return capacities
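
Once the app is running, the new route behaves like the existing property endpoints. A minimal sketch of a client-side call, assuming the FastAPI app is served locally on port 8000 and that region 1 exists (host, port, and region id are assumptions, not part of this commit):

import requests

resp = requests.get("http://localhost:8000/region/1/capacities")
resp.raise_for_status()
payload = resp.json()
# Keys come from etl_region_capacities below:
print(payload["scrapeDates"][:3], payload["property_ids"][:3])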

View File

@@ -28,8 +28,6 @@ class Database:
        if(spatial_installed and not spatial_installed[0]):
            self.connection.sql("INSTALL spatial")

    def db_overview(self):
        return self.connection.sql("DESCRIBE;").show()
@@ -403,5 +401,20 @@ class Database:
            consultancy_d.properties p
        """)
    def capacity_of_region(self, region_id):
        return self.connection.sql(f"""
            SELECT
                JSON_EXTRACT(body, '$.content.days') as calendarBody,
                strftime(extractions.created_at, '%Y-%m-%d') AS ScrapeDate,
                extractions.property_id
            FROM
                consultancy_d.extractions
            LEFT JOIN
                consultancy_d.properties ON properties.id = extractions.property_id
            WHERE
                type == 'calendar' AND
                properties.seed_id = {region_id}
        """)

View File

@@ -0,0 +1,57 @@
from etl.src import data
import polars as pl
from io import StringIO
import numpy as np

d = data.load()


def region_capacities(id: int):
    # Get data
    df = d.capacity_of_region(id).pl()
    # Turn property_id into ints for sorting
    df = df.cast({"property_id": int})
    # Get uniques for dates and property IDs and sort them
    listOfDates = df.get_column("ScrapeDate").unique().sort()
    listOfPropertyIDs = df.get_column("property_id").unique().sort()
    # Create DataFrames from the lists to merge later
    datesDF = pl.DataFrame(listOfDates).with_row_index("date_index")
    propIdDF = pl.DataFrame(listOfPropertyIDs).with_row_index("prop_index")
    # Merge the DataFrames to generate the indices
    df = df.join(datesDF, on='ScrapeDate')
    df = df.join(propIdDF, on='property_id')
    # Drop the now-redundant columns ScrapeDate and property_id
    df = df[['calendarBody', 'date_index', 'prop_index']]
    # Calculate grid values
    gridData = []
    for row in df.rows(named=True):
        # Use 0 as the sum if the calendar body is null
        if row['calendarBody']:
            calDF = pl.read_json(StringIO(row['calendarBody']))
            sum_hor = calDF.sum_horizontal()[0]
        else:
            sum_hor = 0
        gridData.append([row['prop_index'], row['date_index'], sum_hor])
    gridData = np.array(gridData)
    # Scale every value to a percentage of the global maximum
    allValues = gridData[:, 2]
    maxValue = np.max(allValues)
    gridData[:, 2] = (gridData[:, 2] * 100) / maxValue
    # Turn the grid back into a plain list
    gridData = gridData.tolist()
    # Cast listOfDates to datetime
    listOfDates = listOfDates.cast(pl.Date).to_list()
    listOfPropertyIDs = listOfPropertyIDs.to_list()
    # Assemble the JSON-serialisable output
    outDict = {'scrapeDates': listOfDates, 'property_ids': listOfPropertyIDs, 'values': gridData}
    return outDict
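
The normalisation near the end rescales every summed calendar to a percentage of the global maximum. A worked sketch with invented numbers (note the float dtype: an integer array would silently truncate the percentages on assignment):

import numpy as np

gridData = np.array([
    [0, 0, 12],   # [prop_index, date_index, summed calendar days]
    [0, 1, 30],
    [1, 0, 6],
], dtype=float)   # float dtype avoids integer truncation
maxValue = np.max(gridData[:, 2])
gridData[:, 2] = (gridData[:, 2] * 100) / maxValue
print(gridData[:, 2])  # [ 40. 100.  20.]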

View File

@@ -31,8 +31,6 @@ dfNew = pl.from_dicts(data)
dfNew.write_csv('results/data_quality.csv')
print(dfNew)
'''
dfNew = pl.read_csv('results/data_quality.csv')
dfNew = dfNew.with_columns(pl.col("created_at").map_elements(lambda x: datetime.strptime(x, "%Y-%m-%d").date()))