421 lines
9.5 KiB
Python
421 lines
9.5 KiB
Python
from threading import Thread, current_thread
|
|
|
|
import duckdb
|
|
|
|
|
|
class Database:
|
|
|
|
def check_duckdb_extensions(self, extension):
|
|
return self.connection.execute("""
|
|
SELECT
|
|
installed
|
|
FROM
|
|
duckdb_extensions()
|
|
WHERE
|
|
extension_name = $extension
|
|
""",
|
|
{
|
|
"extension": extension
|
|
}
|
|
).fetchone()
|
|
|
|
def __init__(self, path):
|
|
duckdb_connection = duckdb.connect(database = path, read_only=True)
|
|
self.connection = duckdb_connection.cursor()
|
|
|
|
# Install spatial extension if not already installed
|
|
spatial_installed = self.check_duckdb_extensions(extension='spatial')
|
|
if(spatial_installed and not spatial_installed[0]):
|
|
self.connection.sql("INSTALL spatial")
|
|
|
|
def db_overview(self):
|
|
return self.connection.sql("DESCRIBE;").show()
|
|
|
|
def seeds(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
regions.name,
|
|
seeds.uri
|
|
FROM
|
|
consultancy_d.regions
|
|
LEFT JOIN
|
|
consultancy_d.seeds ON regions.id = seeds.region_id;
|
|
""").show()
|
|
|
|
def properties_growth(self):
|
|
return self.connection.sql("""
|
|
WITH PropertiesALL AS (
|
|
SELECT
|
|
strftime(created_at, '%Y-%m-%d') AS date,
|
|
COUNT(*) as properties_count,
|
|
SUM(properties_count) OVER (ORDER BY date) AS total
|
|
FROM
|
|
consultancy_d.properties p
|
|
GROUP BY
|
|
date
|
|
ORDER BY
|
|
date
|
|
),
|
|
PropertiesR1 AS (
|
|
SELECT
|
|
strftime(created_at, '%Y-%m-%d') AS date,
|
|
COUNT(*) as properties_count,
|
|
SUM(properties_count) OVER (ORDER BY date) AS total
|
|
FROM
|
|
consultancy_d.properties p
|
|
WHERE
|
|
p.seed_id = 1
|
|
GROUP BY
|
|
date
|
|
ORDER BY
|
|
date
|
|
),
|
|
PropertiesR2 AS (
|
|
SELECT
|
|
strftime(created_at, '%Y-%m-%d') AS date,
|
|
COUNT(*) as properties_count,
|
|
SUM(properties_count) OVER (ORDER BY date) AS total
|
|
FROM
|
|
consultancy_d.properties p
|
|
WHERE
|
|
p.seed_id = 2
|
|
GROUP BY
|
|
date
|
|
ORDER BY
|
|
date
|
|
),
|
|
PropertiesR3 AS (
|
|
SELECT
|
|
strftime(created_at, '%Y-%m-%d') AS date,
|
|
COUNT(*) as properties_count,
|
|
SUM(properties_count) OVER (ORDER BY date) AS total
|
|
FROM
|
|
consultancy_d.properties p
|
|
WHERE
|
|
p.seed_id = 3
|
|
GROUP BY
|
|
date
|
|
ORDER BY
|
|
date
|
|
),
|
|
PropertiesR4 AS (
|
|
SELECT
|
|
strftime(created_at, '%Y-%m-%d') AS date,
|
|
COUNT(*) as properties_count,
|
|
SUM(properties_count) OVER (ORDER BY date) AS total
|
|
FROM
|
|
consultancy_d.properties p
|
|
WHERE
|
|
p.seed_id = 4
|
|
GROUP BY
|
|
date
|
|
ORDER BY
|
|
date
|
|
)
|
|
SELECT
|
|
p.date,
|
|
p.total AS total_all,
|
|
pR1.total as total_heidiland,
|
|
pR2.total AS total_davos,
|
|
pR3.total AS total_engadin,
|
|
pR4.total AS total_stmoritz
|
|
FROM
|
|
PropertiesAll p
|
|
LEFT JOIN
|
|
PropertiesR1 pR1 ON p.date = pR1.date
|
|
LEFT JOIN
|
|
PropertiesR2 pR2 ON p.date = pR2.date
|
|
LEFT JOIN
|
|
PropertiesR3 pR3 ON p.date = pR3.date
|
|
LEFT JOIN
|
|
PropertiesR4 pR4 ON p.date = pR4.date
|
|
ORDER BY
|
|
p.date
|
|
""")
|
|
|
|
def properties_per_region(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
regions.name,
|
|
COUNT(*) AS count_properties
|
|
FROM
|
|
consultancy_d.properties
|
|
LEFT JOIN
|
|
consultancy_d.seeds ON seeds.id = properties.seed_id
|
|
LEFT JOIN
|
|
consultancy_d.regions ON regions.id = seeds.region_id
|
|
GROUP BY
|
|
properties.seed_id,
|
|
regions.name
|
|
ORDER BY
|
|
count_properties ASC
|
|
""")
|
|
|
|
def propIds_with_region(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
properties.id, seed_id, regions.name
|
|
FROM
|
|
consultancy_d.properties
|
|
LEFT JOIN
|
|
consultancy_d.seeds ON seeds.id = properties.seed_id
|
|
LEFT JOIN
|
|
consultancy_d.regions ON regions.id = seeds.region_id
|
|
""")
|
|
|
|
def properties_unreachable(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
entity_id,
|
|
strftime(properties.created_at, '%Y-%m-%d') AS first_found,
|
|
strftime(properties.last_found, '%Y-%m-%d') AS last_found
|
|
FROM
|
|
consultancy_d.exceptions
|
|
LEFT JOIN
|
|
consultancy_d.properties ON properties.id = exceptions.entity_id
|
|
WHERE
|
|
JSON_VALID(exception) = true AND
|
|
JSON_EXTRACT(exception, '$.status') = '404'
|
|
GROUP BY ALL
|
|
ORDER BY
|
|
last_found
|
|
""").show()
|
|
|
|
def properties_not_found(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
COUNT(entity_id) as count_props,
|
|
strftime(created_at, '%Y-%m-%d') as date
|
|
FROM
|
|
consultancy_d.exceptions
|
|
WHERE
|
|
JSON_VALID(exception) = true AND
|
|
JSON_EXTRACT(exception, '$.status') > 400
|
|
GROUP BY
|
|
date
|
|
""").show()
|
|
|
|
def properties_distance(self):
|
|
|
|
return self.connection.sql("""
|
|
LOAD spatial;
|
|
|
|
CREATE OR REPLACE VIEW geolocation_changes AS
|
|
SELECT
|
|
exceptions.entity_id,
|
|
properties.check_data AS geolocation_original,
|
|
SUBSTRING(exceptions.exception, 28) AS geolocation_new,
|
|
ST_Distance_Sphere(
|
|
ST_GeomFromText(
|
|
CONCAT(
|
|
'POINT(',
|
|
REPLACE(properties.check_data, ',', ' '),
|
|
')'
|
|
)
|
|
),
|
|
ST_GeomFromText(
|
|
CONCAT(
|
|
'POINT(',
|
|
REPLACE(SUBSTRING(exceptions.exception, 28), ',', ' '),
|
|
')'
|
|
)
|
|
)
|
|
) AS distance
|
|
FROM
|
|
consultancy_d.exceptions
|
|
LEFT JOIN
|
|
consultancy_d.properties ON exceptions.entity_id = properties.id
|
|
WHERE
|
|
exception LIKE 'geoLocation was different%'
|
|
GROUP BY
|
|
entity_id,
|
|
check_data,
|
|
geolocation_new
|
|
ORDER BY
|
|
distance;
|
|
|
|
SELECT * FROM geolocation_changes;
|
|
|
|
SELECT
|
|
'0 bis 25' AS category,
|
|
COUNT(*) as count_properties
|
|
FROM
|
|
geolocation_changes
|
|
WHERE
|
|
distance >= (0)
|
|
AND distance < (25)
|
|
UNION
|
|
SELECT
|
|
'25 bis 50' AS category,
|
|
COUNT(*) as count_properties
|
|
FROM
|
|
geolocation_changes
|
|
WHERE
|
|
distance >= (25)
|
|
AND distance < (50)
|
|
UNION
|
|
SELECT
|
|
'50 bis 75' AS category,
|
|
COUNT(*) as count_properties
|
|
FROM
|
|
geolocation_changes
|
|
WHERE
|
|
distance >= (50)
|
|
AND distance < (75)
|
|
UNION
|
|
SELECT
|
|
'75 bis 100' AS category,
|
|
COUNT(*) as count_properties
|
|
FROM
|
|
geolocation_changes
|
|
WHERE
|
|
distance >= (75)
|
|
AND distance < (100);
|
|
|
|
""")
|
|
|
|
def properties_exceptions(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
JSON_EXTRACT(exception, '$.status') AS exception_status,
|
|
COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
|
|
FROM
|
|
consultancy_d.exceptions
|
|
WHERE
|
|
type != 'property'
|
|
GROUP BY
|
|
JSON_EXTRACT(exception, '$.status')
|
|
""")
|
|
|
|
def extractions(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
JSON_EXTRACT(body, '$.content.days') as calendar,
|
|
property_id,
|
|
created_at
|
|
FROM
|
|
consultancy_d.extractions
|
|
WHERE
|
|
type == 'calendar'
|
|
ORDER BY
|
|
property_id
|
|
""")
|
|
|
|
def extractions_with_region(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
JSON_EXTRACT(body, '$.content.days') as calendar,
|
|
extractions.property_id,
|
|
extractions.created_at,
|
|
properties.seed_id,
|
|
regions.name
|
|
FROM
|
|
consultancy_d.extractions
|
|
LEFT JOIN
|
|
consultancy_d.properties ON properties.id = extractions.property_id
|
|
LEFT JOIN
|
|
consultancy_d.seeds ON seeds.id = properties.seed_id
|
|
LEFT JOIN
|
|
consultancy_d.regions ON regions.id = seeds.region_id
|
|
""")
|
|
|
|
def extractions_for(self, property_id):
|
|
return self.connection.sql(f"""
|
|
SELECT
|
|
JSON_EXTRACT(body, '$.content.days') as calendar,
|
|
created_at
|
|
FROM
|
|
consultancy_d.extractions
|
|
WHERE
|
|
type == 'calendar' AND
|
|
property_id = {property_id} AND
|
|
calendar NOT NULL
|
|
ORDER BY
|
|
created_at
|
|
""")
|
|
|
|
# Anzahl der extrahierten properties pro Exktraktionsvorgang
|
|
def properties_per_extraction(self, property_id):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
COUNT(property_id),
|
|
strftime(created_at, '%Y-%m-%d') AS date
|
|
FROM
|
|
consultancy_d.extractions
|
|
WHERE
|
|
type == 'calendar'
|
|
GROUP BY
|
|
date
|
|
ORDER BY date ASC
|
|
""")
|
|
|
|
def price(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
|
|
JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
|
|
JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
|
|
property_id,
|
|
created_at
|
|
FROM
|
|
consultancy_d.extractions
|
|
WHERE
|
|
type == 'price'
|
|
ORDER BY property_id
|
|
""")
|
|
|
|
def price_developement_per_property(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
|
|
property_id,
|
|
created_at
|
|
FROM
|
|
consultancy_d.extractions
|
|
WHERE
|
|
type == 'price'
|
|
ORDER BY property_id
|
|
""")
|
|
|
|
def property_base_data(self, id):
|
|
return self.connection.sql(f"""
|
|
SELECT
|
|
p.property_platform_id,
|
|
p.created_at as first_found,
|
|
p.last_found,
|
|
r.name as region_name
|
|
FROM
|
|
consultancy_d.properties p
|
|
INNER JOIN consultancy_d.seeds s ON s.id = p.seed_id
|
|
INNER JOIN consultancy_d.regions r ON s.region_id = r.id
|
|
WHERE
|
|
p.id = {id}
|
|
""")
|
|
|
|
def properties_geo(self):
|
|
return self.connection.sql("""
|
|
SELECT
|
|
p.id,
|
|
p.check_data as coordinates
|
|
FROM
|
|
consultancy_d.properties p
|
|
""")
|
|
|
|
def capacity_of_region(self, region_id):
|
|
return self.connection.sql(f"""
|
|
SELECT
|
|
JSON_EXTRACT(body, '$.content.days') as calendarBody,
|
|
strftime(extractions.created_at, '%Y-%m-%d') AS ScrapeDate,
|
|
extractions.property_id,
|
|
FROM
|
|
consultancy_d.extractions
|
|
LEFT JOIN
|
|
consultancy_d.properties ON properties.id = extractions.property_id
|
|
WHERE
|
|
type == 'calendar' AND
|
|
properties.seed_id = {region_id}
|
|
""")
|
|
|
|
|
|
|