From 8bcc1c57b500a96fbe44718caa716f4afacb1d4d Mon Sep 17 00:00:00 2001
From: mmaurostoffel <166130318+mmaurostoffel@users.noreply.github.com>
Date: Sun, 5 Jan 2025 17:25:29 +0100
Subject: [PATCH] Gitea Issue 1 Beispiel 2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

https://gitea.fhgr.ch/stoffelmauro/ConsultancyProject_2_ETL/issues/1
etl_region_capacities_comparison eingefügt
---
 etl/src/api/main.py                           |  6 +-
 etl/src/data/database.py                      | 17 ++++-
 etl/src/data/etl_region_capacities.py         |  2 -
 .../data/etl_region_capacities_comparison.py  | 68 +++++++++++++++++++
 4 files changed, 89 insertions(+), 4 deletions(-)
 create mode 100644 etl/src/data/etl_region_capacities_comparison.py

diff --git a/etl/src/api/main.py b/etl/src/api/main.py
index 226a515..cc26adf 100644
--- a/etl/src/api/main.py
+++ b/etl/src/api/main.py
@@ -2,6 +2,7 @@ import data
 import polars as pl
 from data import etl_property_capacities as etl_pc
 from data import etl_region_capacities as etl_rc
+from data import etl_region_capacities_comparison as etl_rcc
 from fastapi import FastAPI, Response
 
 d = data.load()
@@ -50,5 +51,8 @@ def region_capacities_data(id: int):
     capacities = etl_rc.region_capacities(id)
     return capacities
 
-
+@app.get("/region/capacities/comparison/{id_1}/{id_2}")
+def region_capacities_comparison_data(id_1: int, id_2: int):
+    capacities = etl_rcc.region_capacities_comparison(id_1, id_2)
+    return capacities
diff --git a/etl/src/data/database.py b/etl/src/data/database.py
index 4839824..8cabbca 100644
--- a/etl/src/data/database.py
+++ b/etl/src/data/database.py
@@ -430,5 +430,20 @@
                 type == 'calendar'
             """)
 
 
-
+    def capacity_comparison_of_region(self, region_id_1, region_id_2):
+        return self.connection.sql(f"""
+            SELECT
+                JSON_EXTRACT(body, '$.content.days') as calendarBody,
+                strftime(extractions.created_at, '%Y-%m-%d') AS ScrapeDate,
+                extractions.property_id,
+                properties.seed_id
+            FROM
+                consultancy_d.extractions
+            LEFT JOIN
+                consultancy_d.properties ON properties.id = extractions.property_id
+            WHERE
+                type == 'calendar' AND
+                (properties.seed_id = {region_id_1} OR
+                properties.seed_id = {region_id_2})
+        """)
diff --git a/etl/src/data/etl_region_capacities.py b/etl/src/data/etl_region_capacities.py
index 52ca0e8..1704258 100644
--- a/etl/src/data/etl_region_capacities.py
+++ b/etl/src/data/etl_region_capacities.py
@@ -45,9 +45,7 @@ def region_capacities(id: int):
     gridData = np.array(gridData)
     # get all values to calculate Max
     allValues = gridData[:, 2].astype(int)
-    print(allValues)
     maxValue = np.max(allValues)
-    print(maxValue)
     gridData[:, 2] = (allValues*100)/maxValue
 
     # Return back to list
diff --git a/etl/src/data/etl_region_capacities_comparison.py b/etl/src/data/etl_region_capacities_comparison.py
new file mode 100644
index 0000000..f06f2bf
--- /dev/null
+++ b/etl/src/data/etl_region_capacities_comparison.py
@@ -0,0 +1,68 @@
+import data
+import polars as pl
+from io import StringIO
+import numpy as np
+
+
+d = data.load()
+
+def region_capacities_comparison(id_1: int, id_2: int):
+    fulldf = d.capacity_comparison_of_region(id_1, id_2).pl()
+    # turn PropertyIDs and seedIDs to ints for sorting and filtering
+    fulldf = fulldf.cast({"property_id": int})
+    fulldf = fulldf.cast({"seed_id": int})
+    df_region1 = fulldf.filter(pl.col("seed_id") == id_1)
+    df_region2 = fulldf.filter(pl.col("seed_id") == id_2)
+    df_list = [df_region1, df_region2]
+    outDictList = []
+
+    for df in df_list:
+        # Get uniques for dates and propIDs and sort them
+        listOfDates = df.get_column("ScrapeDate").unique().sort()
+        listOfPropertyIDs = df.get_column("property_id").unique().sort()
+
+        # Create DFs from lists to merge later
+        datesDF = pl.DataFrame(listOfDates).with_row_index("date_index")
+        propIdDF = pl.DataFrame(listOfPropertyIDs).with_row_index("prop_index")
+
+        # Merge Dataframe to generate indices
+        df = df.join(datesDF, on='ScrapeDate')
+        df = df.join(propIdDF, on='property_id')
+        # Drop now useless columns ScrapeDate and property_id
+        df = df[['ScrapeDate', 'calendarBody', 'date_index', 'prop_index']]
+        # Calculate grid values
+        gridData = []
+        for row in df.rows(named=True):
+            # Return 0 for sum if calendar is null
+            if row['calendarBody']:
+                calDF = pl.read_json(StringIO(row['calendarBody']))
+                sum_hor = calDF.sum_horizontal()[0]
+            else:
+                sum_hor = 0
+            # With Index
+            # gridData.append([row['prop_index'], row['date_index'], sum_hor])
+            # With ScrapeDate
+            gridData.append([row['ScrapeDate'], row['date_index'], sum_hor])
+
+        gridData = np.array(gridData)
+        # get all values to calculate Max
+        allValues = gridData[:, 2].astype(int)
+        maxValue = np.max(allValues)
+        gridData[:, 2] = (allValues*100)/maxValue
+
+        # Return back to list
+        gridData = gridData.tolist()
+
+        # Cast listOfDates to datetime
+        listOfDates = listOfDates.cast(pl.Date).to_list()
+        listOfPropertyIDs = listOfPropertyIDs.to_list()
+
+        # Create JSON
+        tempDict = {'scrapeDates': listOfDates, 'property_ids': listOfPropertyIDs, 'values': gridData}
+        outDictList.append(tempDict)
+
+    outDict = {'region1': outDictList[0], 'region2': outDictList[1],}
+    return outDict
+
+if __name__ == "__main__":
+    print(region_capacities_comparison(1, 2))
\ No newline at end of file