ConsultancyProject_2_ETL/etl/src/data/etl_property_neighbours.py
mmaurostoffel cdb92ac50f Aufräumen des etl_property_neighbours
Kommentare gelöscht und haversineFormel angepasst, dass sie gleich wie in der Quelle ist
2025-01-19 11:54:22 +01:00

73 lines
2.2 KiB
Python

from math import asin, atan2, cos, degrees, radians, sin, sqrt
import polars as pl
import data
from data import etl_cache
# Module-level handle returned by the project's data.load(), created once at
# import time. property_neighbours() reads d.properties_geo_seeds() from it.
d = data.load()
def calcHaversinDistance(latMain, lonMain, lat, lon):
    """Return the great-circle distance in kilometres between two points.

    Applies the haversine formula on a sphere of radius 6371 km.

    Args:
        latMain: Latitude of the reference point, in decimal degrees.
        lonMain: Longitude of the reference point, in decimal degrees.
        lat: Latitude of the other point, in decimal degrees.
        lon: Longitude of the other point, in decimal degrees.

    Returns:
        The distance between the two points in kilometres.
    """
    R = 6371  # sphere radius in km

    # convert decimal degrees to radians
    latMain, lonMain, lat, lon = map(radians, [latMain, lonMain, lat, lon])

    # haversine formula
    dlon = lonMain - lon
    dlat = latMain - lat
    a = sin(dlat / 2) ** 2 + cos(lat) * cos(latMain) * sin(dlon / 2) ** 2

    # For (near-)antipodal points rounding can push `a` a hair above 1, which
    # would make sqrt(1 - a) raise "math domain error". Cap it at 1; the
    # comparison is False for NaN, so NaN inputs still propagate as NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance
def property_neighbours(id: int):
    """Return the (up to) ten properties closest to property ``id``.

    Candidates are the rows of ``d.properties_geo_seeds()`` that share the
    ``seed_id`` of the requested property, excluding the property itself.
    They are ranked by haversine distance to the requested property.

    The result is stored through ``etl_cache`` under a per-id file name and
    returned from there on later calls. A falsy cached value is treated as a
    cache miss and the result is recomputed.

    Args:
        id: Identifier of the main property; compared as ``str(id)`` against
            the ``id`` column.

    Returns:
        A list of dicts, one per neighbour, nearest first. Each dict holds the
        remaining columns of the source frame (without ``seed_id`` and
        ``coordinates``) plus ``lat`` and ``lon``.
    """
    cache_name = f"etl_property_neighbours_{id}.obj"
    cached = etl_cache.openObj(cache_name)
    if cached:
        return cached

    geo = d.properties_geo_seeds().pl()
    wanted_id = str(id)

    # Position and seed region of the requested property.
    main_row = geo.filter(pl.col('id') == wanted_id)
    raw_lat, raw_lon = main_row['coordinates'][0].split(',')
    latMain, lonMain = float(raw_lat), float(raw_lon)
    region = main_row['seed_id'][0]

    # Keep only properties of the same region, without the requested one.
    candidates = (
        geo.filter(pl.col('seed_id') == str(region))
        .drop('seed_id')
        .filter(pl.col('id') != wanted_id)
    )

    # Turn the "lat,lon" string column into two numeric columns.
    lat_lon_struct = (
        pl.col("coordinates")
        .str.split_exact(",", 1)
        .struct.rename_fields(["lat", "lon"])
        .alias("lat/lon")
    )
    candidates = (
        candidates.with_columns(lat_lon_struct)
        .unnest("lat/lon")
        .drop('coordinates')
        .with_columns([
            pl.col("lat").cast(pl.Float32),
            pl.col("lon").cast(pl.Float32),
        ])
    )

    # Distance from the requested property to every candidate, in row order.
    distances = [
        calcHaversinDistance(latMain, lonMain, lat, lon)
        for lat, lon in candidates.select(["lat", "lon"]).iter_rows()
    ]

    # Rank by distance, keep the ten nearest, and hide the helper column.
    nearest = (
        candidates.with_columns(pl.Series(name="distances", values=distances))
        .sort("distances")
        .head(10)
        .drop('distances')
    )

    result = nearest.to_dicts()
    etl_cache.saveObj(cache_name, result)
    return result