# Nearest-neighbour lookup for properties, based on great-circle distance.
#
# Reconstructed from the patch that adds
# etl/src/data/etl_property_neighbours.py.
#
# Companion change carried by the same patch, in
# etl/src/data/database.py (class Database). It is the query this module
# reads through `d.properties_geo_seeds()`:
#
#     def properties_geo_seeds(self):
#         return self.connection.sql("""
#             SELECT
#                 p.id,
#                 p.seed_id,
#                 p.check_data as coordinates
#             FROM
#                 consultancy_d.properties p
#         """)

from math import radians, cos, sin, asin, sqrt, degrees, atan2

import polars as pl

import data

d = data.load()

# Sphere radius used by the haversine formula; 6371 is the usual mean Earth
# radius in kilometres, so distances come out in kilometres.
EARTH_RADIUS_KM = 6371


def calcHaversinDistance(latMain, lonMain, lat, lon):
    """Return the haversine (great-circle) distance between two points.

    Args:
        latMain: Latitude of the reference point, in decimal degrees.
        lonMain: Longitude of the reference point, in decimal degrees.
        lat: Latitude of the other point, in decimal degrees.
        lon: Longitude of the other point, in decimal degrees.

    Returns:
        The distance on a sphere of radius ``EARTH_RADIUS_KM``.
    """
    # Convert decimal degrees to radians.
    latMain, lonMain, lat, lon = map(radians, (latMain, lonMain, lat, lon))

    dlat = latMain - lat
    dlon = lonMain - lon

    a = sin(dlat / 2) ** 2 + cos(lat) * cos(latMain) * sin(dlon / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make asin() raise ValueError.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))

    # Named `distance`, not `d`, so the module-level database handle `d`
    # is never shadowed.
    distance = EARTH_RADIUS_KM * c
    return distance


def property_neighbours(id: int, limit: int = 10):
    """Return the closest properties sharing a seed with property ``id``.

    Every property with the same ``seed_id`` as the requested one (excluding
    the property itself) is ranked by haversine distance to it, and the
    nearest ``limit`` are returned. Properties whose coordinates are missing
    are skipped, since no distance can be computed for them.

    Args:
        id: Identifier of the reference property. It is compared against the
            ``id`` column as a string.
        limit: Maximum number of neighbours to return (default 10).

    Returns:
        A dict with three parallel lists ordered by increasing distance:
        ``"ids"``, ``"lat"`` and ``"lon"``.

    Raises:
        IndexError: If no property with the given ``id`` exists.
        ValueError: If ``limit`` is negative, or if the reference property
            has missing or malformed coordinates.
    """
    if limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit}")

    extractions = d.properties_geo_seeds().pl()
    main_id = str(id)

    # Look the reference property up once instead of filtering twice.
    main = extractions.filter(pl.col("id") == main_id)
    if main.height == 0:
        raise IndexError(f"property {id} not found")

    coordinates = main["coordinates"][0]
    if coordinates is None:
        raise ValueError(f"property {id} has no coordinates")
    # Coordinates are stored as a single "lat,lon" string.
    latMain, lonMain = map(float, coordinates.split(","))
    region = main["seed_id"][0]

    neighbours = (
        extractions
        # Same region only, and never the reference property itself.
        .filter(
            (pl.col("seed_id") == str(region)) & (pl.col("id") != main_id)
        )
        .drop("seed_id")
        # Split "lat,lon" into two separate columns.
        .with_columns(
            pl.col("coordinates")
            .str.split_exact(",", 1)
            .struct.rename_fields(["lat", "lon"])
            .alias("lat/lon")
        )
        .unnest("lat/lon")
        .drop("coordinates")
        # Float64 matches the precision used for the reference property
        # (Python float); Float32 would visibly round the coordinates.
        .with_columns(pl.col(["lat", "lon"]).cast(pl.Float64))
        # Rows without usable coordinates cannot be ranked by distance.
        .drop_nulls(subset=["lat", "lon"])
    )

    distances = [
        calcHaversinDistance(latMain, lonMain, lat, lon)
        for lat, lon in neighbours.select(["lat", "lon"]).rows()
    ]

    nearest = (
        neighbours
        # Explicit dtype keeps the column well-typed when there are no rows.
        .with_columns(
            pl.Series(name="distances", values=distances, dtype=pl.Float64)
        )
        .sort("distances")
        .head(limit)
        .drop("distances")
    )

    return {
        "ids": nearest["id"].to_list(),
        "lat": nearest["lat"].to_list(),
        "lon": nearest["lon"].to_list(),
    }