from threading import Thread, current_thread

import duckdb


class Database:
    """Read-only reporting queries against a DuckDB database file.

    The queries read from tables in the ``consultancy_d`` schema/catalog
    (``regions``, ``seeds``, ``properties``, ``exceptions``, ``extractions``).

    Some methods return a DuckDB relation for further processing, others
    print the result via ``.show()``; each method documents which it does.

    Instances can be used as context managers so the connection is closed
    deterministically::

        with Database("data.duckdb") as db:
            db.seeds()
    """

    def __init__(self, path):
        """Open ``path`` read-only and make sure the spatial extension exists.

        Args:
            path: Location of the DuckDB database file.
        """
        # Keep a reference to the parent connection so close() can release it.
        self._root_connection = duckdb.connect(database=path, read_only=True)
        self.connection = self._root_connection.cursor()

        # Install the spatial extension if DuckDB knows it but it is not
        # installed yet. A missing row (None) means the extension is unknown
        # to this build, in which case nothing is installed.
        spatial_installed = self.check_duckdb_extensions(extension='spatial')
        if spatial_installed and not spatial_installed[0]:
            self.connection.sql("INSTALL spatial")

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connections; exceptions are never suppressed."""
        self.close()
        return False

    def close(self):
        """Close the cursor and the parent connection opened by ``__init__``."""
        self.connection.close()
        self._root_connection.close()

    def check_duckdb_extensions(self, extension):
        """Look up the install state of a DuckDB extension.

        Args:
            extension: Extension name as listed by ``duckdb_extensions()``.

        Returns:
            The single-column ``(installed,)`` row for the extension, or
            ``None`` when ``duckdb_extensions()`` has no row with that name.
        """
        return self.connection.execute(
            """
            SELECT installed
            FROM duckdb_extensions()
            WHERE extension_name = $extension
            """,
            {"extension": extension},
        ).fetchone()

    def db_overview(self):
        """Print the output of ``DESCRIBE;`` and return the ``.show()`` result."""
        return self.connection.sql("DESCRIBE;").show()

    def seeds(self):
        """Print every region name with its seed URIs.

        Regions are left-joined to seeds, so regions without a seed are
        still listed. Returns the result of ``.show()``.
        """
        return self.connection.sql(
            """
            SELECT regions.name, seeds.uri
            FROM consultancy_d.regions
            LEFT JOIN consultancy_d.seeds ON regions.id = seeds.region_id;
            """
        ).show()

    def properties_growth(self):
        """Return a relation with the number of properties created per day.

        Columns: ``date`` (``created_at`` formatted as ``YYYY-MM-DD``) and
        ``properties_count``.
        """
        return self.connection.sql(
            """
            SELECT
                strftime(created_at, '%Y-%m-%d') AS date,
                COUNT(*) as properties_count
            FROM consultancy_d.properties
            GROUP BY date;
            """
        )

    def properties_per_region(self):
        """Return a relation with the property count per seed and region.

        Rows are grouped by ``properties.seed_id`` and ``regions.name``, so a
        region with several seeds yields one row per seed.
        """
        return self.connection.sql(
            """
            SELECT regions.name, COUNT(*) AS count_properties
            FROM consultancy_d.properties
            LEFT JOIN consultancy_d.seeds ON seeds.id = properties.seed_id
            LEFT JOIN consultancy_d.regions ON regions.id = seeds.region_id
            GROUP BY properties.seed_id, regions.name
            """
        )

    def propIds_with_region(self):
        """Return a relation of property id, seed id and region name."""
        return self.connection.sql(
            """
            SELECT properties.id, seed_id, regions.name
            FROM consultancy_d.properties
            LEFT JOIN consultancy_d.seeds ON seeds.id = properties.seed_id
            LEFT JOIN consultancy_d.regions ON regions.id = seeds.region_id
            """
        )

    def properties_unreachable(self):
        """Print entities whose recorded exception has status ``404``.

        Only rows whose ``exception`` column holds valid JSON are considered.
        Output columns are ``entity_id``, ``first_found`` and ``last_found``
        (both formatted as ``YYYY-MM-DD``), ordered by ``last_found``.
        Returns the result of ``.show()``.
        """
        return self.connection.sql(
            """
            SELECT
                entity_id,
                strftime(properties.created_at, '%Y-%m-%d') AS first_found,
                strftime(properties.last_found, '%Y-%m-%d') AS last_found
            FROM consultancy_d.exceptions
            LEFT JOIN consultancy_d.properties
                ON properties.id = exceptions.entity_id
            WHERE JSON_VALID(exception) = true
                AND JSON_EXTRACT(exception, '$.status') = '404'
            GROUP BY ALL
            ORDER BY last_found
            """
        ).show()

    def properties_not_found(self):
        """Print the per-day count of exceptions with a status above 400.

        Only rows whose ``exception`` column holds valid JSON are considered.
        Returns the result of ``.show()``.

        NOTE(review): the filter is ``status > 400``, which is broader than
        "not found" (404) alone - confirm this is intended.
        """
        return self.connection.sql(
            """
            SELECT
                COUNT(entity_id) as count_props,
                strftime(created_at, '%Y-%m-%d') as date
            FROM consultancy_d.exceptions
            WHERE JSON_VALID(exception) = true
                AND JSON_EXTRACT(exception, '$.status') > 400
            GROUP BY date
            """
        ).show()

    def properties_distance(self):
        """Return a relation bucketing geolocation changes by distance.

        Loads the spatial extension and (re)creates the temporary view
        ``geolocation_changes`` holding, for every exception whose text
        starts with ``geoLocation was different``, the original location
        (``properties.check_data``), the new location (the exception text
        from character 28 onwards) and the ``ST_Distance_Sphere`` distance
        between the two. The view stays available on ``self.connection``
        for follow-up queries.

        Returns:
            A relation with columns ``category`` and ``count_properties``
            for the buckets 0-25, 25-50, 50-75 and 75-100.
        """
        # The view is TEMP because the database is opened read-only: a
        # regular CREATE VIEW would be rejected, whereas temporary objects
        # live in the connection-local temp catalog.
        return self.connection.sql(
            """
            LOAD spatial;

            CREATE OR REPLACE TEMP VIEW geolocation_changes AS
            SELECT
                exceptions.entity_id,
                properties.check_data AS geolocation_original,
                SUBSTRING(exceptions.exception, 28) AS geolocation_new,
                ST_Distance_Sphere(
                    ST_GeomFromText(
                        CONCAT(
                            'POINT(',
                            REPLACE(properties.check_data, ',', ' '),
                            ')'
                        )
                    ),
                    ST_GeomFromText(
                        CONCAT(
                            'POINT(',
                            REPLACE(SUBSTRING(exceptions.exception, 28), ',', ' '),
                            ')'
                        )
                    )
                ) AS distance
            FROM consultancy_d.exceptions
            LEFT JOIN consultancy_d.properties
                ON exceptions.entity_id = properties.id
            WHERE exception LIKE 'geoLocation was different%'
            GROUP BY entity_id, check_data, geolocation_new
            ORDER BY distance;

            SELECT '0 bis 25' AS category, COUNT(*) as count_properties
            FROM geolocation_changes
            WHERE distance >= (0) AND distance < (25)
            UNION
            SELECT '25 bis 50' AS category, COUNT(*) as count_properties
            FROM geolocation_changes
            WHERE distance >= (25) AND distance < (50)
            UNION
            SELECT '50 bis 75' AS category, COUNT(*) as count_properties
            FROM geolocation_changes
            WHERE distance >= (50) AND distance < (75)
            UNION
            SELECT '75 bis 100' AS category, COUNT(*) as count_properties
            FROM geolocation_changes
            WHERE distance >= (75) AND distance < (100);
            """
        )

    def properties_exceptions(self):
        """Return a relation counting exceptions per JSON ``status`` value.

        Rows with ``type = 'property'`` are excluded.
        """
        return self.connection.sql(
            """
            SELECT
                JSON_EXTRACT(exception, '$.status') AS exception_status,
                COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
            FROM consultancy_d.exceptions
            WHERE type != 'property'
            GROUP BY JSON_EXTRACT(exception, '$.status')
            """
        )

    def extractions(self):
        """Return a relation of all calendar extractions.

        Columns: ``calendar`` (``$.content.days`` of the JSON ``body``),
        ``property_id`` and ``created_at``, ordered by ``property_id``.
        """
        return self.connection.sql(
            """
            SELECT
                JSON_EXTRACT(body, '$.content.days') as calendar,
                property_id,
                created_at
            FROM consultancy_d.extractions
            WHERE type == 'calendar'
            ORDER BY property_id
            """
        )

    def extractions_for(self, property_id):
        """Return a relation of the calendar extractions of one property.

        Args:
            property_id: Value compared against ``extractions.property_id``.
                It is bound as a query parameter rather than formatted into
                the SQL text, so it must be the raw value (not pre-quoted).

        Returns:
            A relation with columns ``calendar``, ``property_id`` and
            ``created_at``.
        """
        return self.connection.sql(
            """
            SELECT
                JSON_EXTRACT(body, '$.content.days') as calendar,
                property_id,
                created_at
            FROM consultancy_d.extractions
            WHERE type == 'calendar' AND property_id = ?
            ORDER BY property_id
            """,
            params=[property_id],
        )

    # Number of extracted properties per extraction run
    def properties_per_extraction(self, property_id=None):
        """Return a relation with the calendar extraction count per day.

        Args:
            property_id: Ignored by the query; kept (now optional) so that
                existing callers passing it keep working.

        Returns:
            A relation with the count of ``property_id`` values and ``date``
            (``created_at`` formatted as ``YYYY-MM-DD``), ordered by date.
        """
        return self.connection.sql(
            """
            SELECT
                COUNT(property_id),
                strftime(created_at, '%Y-%m-%d') AS date
            FROM consultancy_d.extractions
            WHERE type == 'calendar'
            GROUP BY date
            ORDER BY date ASC
            """
        )

    def price(self):
        """Return a relation of the lowest prices from price extractions.

        Columns: ``pricePerWeek``, ``pricePerNight`` and ``currency`` (taken
        from ``$.content.lowestPrice`` of the JSON ``body``), plus
        ``property_id`` and ``created_at``, ordered by ``property_id``.
        """
        return self.connection.sql(
            """
            SELECT
                JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
                JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
                property_id,
                created_at
            FROM consultancy_d.extractions
            WHERE type == 'price'
            ORDER BY property_id
            """
        )

    def price_developement_per_property(self):
        """Return a relation of nightly prices per property over time.

        Columns: ``pricePerNight`` (``$.content.lowestPrice.valueNightRaw``
        of the JSON ``body``), ``property_id`` and ``created_at``, ordered
        by ``property_id``.
        """
        return self.connection.sql(
            """
            SELECT
                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
                property_id,
                created_at
            FROM consultancy_d.extractions
            WHERE type == 'price'
            ORDER BY property_id
            """
        )