283 lines
6.2 KiB
Python
Raw Normal View History

from threading import Thread, current_thread
import duckdb
class Database:
    """Read-only query helper around a DuckDB database.

    Every query targets tables in the ``consultancy_d`` schema (regions,
    seeds, properties, exceptions, extractions). Methods either return
    whatever ``connection.sql`` yields for the query or, where noted, print
    the result with ``.show()`` and return ``None``.
    """

    def check_duckdb_extensions(self, extension):
        """Look up the ``installed`` flag of a DuckDB extension.

        Args:
            extension: Extension name matched against
                ``duckdb_extensions().extension_name``.

        Returns:
            The first result row as returned by ``fetchone()`` (a one-element
            row holding the ``installed`` value), or ``None`` when no row
            matches.
        """
        query = """
            SELECT
                installed
            FROM
                duckdb_extensions()
            WHERE
                extension_name = $extension
        """
        return self.connection.execute(query, {"extension": extension}).fetchone()

    def __init__(self, path):
        """Open the database at ``path`` read-only and ensure ``spatial`` is installed.

        Args:
            path: Passed to ``duckdb.connect`` as ``database``.
        """
        duckdb_connection = duckdb.connect(database=path, read_only=True)
        self.connection = duckdb_connection.cursor()

        # Install spatial extension if not already installed. When DuckDB
        # reports no row at all for the extension, nothing is attempted.
        spatial_installed = self.check_duckdb_extensions(extension='spatial')
        if spatial_installed and not spatial_installed[0]:
            self.connection.sql("INSTALL spatial")

    def db_overview(self) -> None:
        """Print the output of ``DESCRIBE;``. Returns ``None`` (result of ``show()``)."""
        return self.connection.sql("DESCRIBE;").show()

    def seeds(self) -> None:
        """Print every region name with its seed URI. Returns ``None``.

        Regions are LEFT JOINed to seeds, so regions without a seed are
        listed too.
        """
        return self.connection.sql("""
            SELECT
                regions.name,
                seeds.uri
            FROM
                consultancy_d.regions
            LEFT JOIN
                consultancy_d.seeds ON regions.id = seeds.region_id;
        """).show()

    def properties_growth(self):
        """Return the number of properties grouped by creation day (``%Y-%m-%d``)."""
        return self.connection.sql("""
            SELECT
                strftime(created_at, '%Y-%m-%d') AS date,
                COUNT(*) as properties_count
            FROM
                consultancy_d.properties
            GROUP BY
                date;
        """)

    def properties_per_region(self):
        """Return property counts grouped by seed id and region name."""
        return self.connection.sql("""
            SELECT
                regions.name,
                COUNT(*) AS count_properties
            FROM
                consultancy_d.properties
            LEFT JOIN
                consultancy_d.seeds ON seeds.id = properties.seed_id
            LEFT JOIN
                consultancy_d.regions ON regions.id = seeds.region_id
            GROUP BY
                properties.seed_id,
                regions.name
        """)

    def propIds_with_region(self):
        """Return each property id with its seed id and region name."""
        return self.connection.sql("""
            SELECT
                properties.id, seed_id, regions.name
            FROM
                consultancy_d.properties
            LEFT JOIN
                consultancy_d.seeds ON seeds.id = properties.seed_id
            LEFT JOIN
                consultancy_d.regions ON regions.id = seeds.region_id
        """)

    def properties_unreachable(self) -> None:
        """Print entities whose exception JSON has status ``'404'``. Returns ``None``.

        Each row carries the entity id plus the joined property's
        ``created_at`` and ``last_found`` dates, ordered by ``last_found``.
        """
        return self.connection.sql("""
            SELECT
                entity_id,
                strftime(properties.created_at, '%Y-%m-%d') AS first_found,
                strftime(properties.last_found, '%Y-%m-%d') AS last_found
            FROM
                consultancy_d.exceptions
            LEFT JOIN
                consultancy_d.properties ON properties.id = exceptions.entity_id
            WHERE
                JSON_VALID(exception) = true AND
                JSON_EXTRACT(exception, '$.status') = '404'
            GROUP BY ALL
            ORDER BY
                last_found
        """).show()

    def properties_not_found(self) -> None:
        """Print per-day counts of exceptions whose JSON status is ``> 400``. Returns ``None``."""
        return self.connection.sql("""
            SELECT
                COUNT(entity_id) as count_props,
                strftime(created_at, '%Y-%m-%d') as date
            FROM
                consultancy_d.exceptions
            WHERE
                JSON_VALID(exception) = true AND
                JSON_EXTRACT(exception, '$.status') > 400
            GROUP BY
                date
        """).show()

    def properties_distance(self):
        """Bucket geolocation changes by distance in steps of 25 up to 100.

        The script loads the ``spatial`` extension, (re)creates the view
        ``geolocation_changes`` and then counts its rows per distance bucket.
        The view compares ``properties.check_data`` with the text of the
        exception from character 28 onwards, for exceptions starting with
        ``'geoLocation was different'``; both are turned into ``POINT(...)``
        geometries and fed to ``ST_Distance_Sphere``.

        Returns:
            The result of ``connection.sql`` for the whole script; presumably
            the relation of the final bucket query (confirm against DuckDB's
            multi-statement behaviour).
        """
        # NOTE(review): __init__ opens the database with read_only=True;
        # confirm that CREATE OR REPLACE VIEW is permitted on this connection.
        return self.connection.sql("""
            LOAD spatial;
            CREATE OR REPLACE VIEW geolocation_changes AS
            SELECT
                exceptions.entity_id,
                properties.check_data AS geolocation_original,
                SUBSTRING(exceptions.exception, 28) AS geolocation_new,
                ST_Distance_Sphere(
                    ST_GeomFromText(
                        CONCAT(
                            'POINT(',
                            REPLACE(properties.check_data, ',', ' '),
                            ')'
                        )
                    ),
                    ST_GeomFromText(
                        CONCAT(
                            'POINT(',
                            REPLACE(SUBSTRING(exceptions.exception, 28), ',', ' '),
                            ')'
                        )
                    )
                ) AS distance
            FROM
                consultancy_d.exceptions
            LEFT JOIN
                consultancy_d.properties ON exceptions.entity_id = properties.id
            WHERE
                exception LIKE 'geoLocation was different%'
            GROUP BY
                entity_id,
                check_data,
                geolocation_new
            ORDER BY
                distance;
            SELECT * FROM geolocation_changes;
            SELECT
                '0 bis 25' AS category,
                COUNT(*) as count_properties
            FROM
                geolocation_changes
            WHERE
                distance >= (0)
                AND distance < (25)
            UNION
            SELECT
                '25 bis 50' AS category,
                COUNT(*) as count_properties
            FROM
                geolocation_changes
            WHERE
                distance >= (25)
                AND distance < (50)
            UNION
            SELECT
                '50 bis 75' AS category,
                COUNT(*) as count_properties
            FROM
                geolocation_changes
            WHERE
                distance >= (50)
                AND distance < (75)
            UNION
            SELECT
                '75 bis 100' AS category,
                COUNT(*) as count_properties
            FROM
                geolocation_changes
            WHERE
                distance >= (75)
                AND distance < (100);
        """)

    def properties_exceptions(self):
        """Return exception counts per JSON ``$.status`` for rows whose type is not ``'property'``."""
        return self.connection.sql("""
            SELECT
                JSON_EXTRACT(exception, '$.status') AS exception_status,
                COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
            FROM
                consultancy_d.exceptions
            WHERE
                type != 'property'
            GROUP BY
                JSON_EXTRACT(exception, '$.status')
        """)

    def extractions(self):
        """Return all ``'calendar'`` extractions ordered by property id.

        Columns: ``calendar`` (``$.content.days`` of the body),
        ``property_id`` and ``created_at``.
        """
        return self.connection.sql("""
            SELECT
                JSON_EXTRACT(body, '$.content.days') as calendar,
                property_id,
                created_at
            FROM
                consultancy_d.extractions
            WHERE
                type == 'calendar'
            ORDER BY
                property_id
        """)

    def extractions_for(self, property_id):
        """Return the ``'calendar'`` extractions of a single property.

        Args:
            property_id: Value compared against ``extractions.property_id``.
                It is bound as a query parameter rather than spliced into the
                SQL text, so arbitrary input cannot alter the statement.

        Returns:
            The result of ``connection.sql`` with the columns ``calendar``,
            ``property_id`` and ``created_at``.
        """
        query = """
            SELECT
                JSON_EXTRACT(body, '$.content.days') as calendar,
                property_id,
                created_at
            FROM
                consultancy_d.extractions
            WHERE
                type == 'calendar' AND
                property_id = $property_id
            ORDER BY
                property_id
        """
        return self.connection.sql(query, params={"property_id": property_id})

    def properties_per_extraction(self, property_id=None):
        """Return the number of extracted properties per extraction run.

        Counts ``'calendar'`` extractions per creation day, ascending by date.

        Args:
            property_id: Unused by the query; kept (now optional) so existing
                callers that pass it keep working.
        """
        return self.connection.sql("""
            SELECT
                COUNT(property_id),
                strftime(created_at, '%Y-%m-%d') AS date
            FROM
                consultancy_d.extractions
            WHERE
                type == 'calendar'
            GROUP BY
                date
            ORDER BY date ASC
        """)

    def price(self):
        """Return weekly price, nightly price and currency of every ``'price'`` extraction.

        Values are read from ``$.content.lowestPrice`` in the extraction body;
        rows are ordered by property id.
        """
        return self.connection.sql("""
            SELECT
                JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
                JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
                property_id,
                created_at
            FROM
                consultancy_d.extractions
            WHERE
                type == 'price'
            ORDER BY property_id
        """)

    def price_developement_per_property(self):
        """Return the nightly price of every ``'price'`` extraction, ordered by property id."""
        return self.connection.sql("""
            SELECT
                JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
                property_id,
                created_at
            FROM
                consultancy_d.extractions
            WHERE
                type == 'price'
            ORDER BY property_id
        """)