421 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			421 lines
		
	
	
		
			9.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| from threading import Thread, current_thread
 | |
| 
 | |
| import duckdb
 | |
| 
 | |
| 
 | |
| class Database:
 | |
| 
 | |
| 	def check_duckdb_extensions(self, extension):
 | |
| 		return self.connection.execute("""
 | |
| 			SELECT 
 | |
| 				installed
 | |
| 			FROM 
 | |
| 				duckdb_extensions()
 | |
| 			WHERE
 | |
| 				extension_name = $extension
 | |
| 			""",
 | |
| 			{
 | |
| 				"extension": extension
 | |
| 			}
 | |
| 		).fetchone()
 | |
| 
 | |
| 	def __init__(self, path):
 | |
| 		duckdb_connection = duckdb.connect(database = path, read_only=True)
 | |
| 		self.connection = duckdb_connection.cursor()
 | |
| 		
 | |
| 		# Install spatial extension if not already installed
 | |
| 		spatial_installed = self.check_duckdb_extensions(extension='spatial')
 | |
| 		if(spatial_installed and not spatial_installed[0]):
 | |
| 			self.connection.sql("INSTALL spatial")
 | |
| 
 | |
| 	def db_overview(self):
 | |
| 		return self.connection.sql("DESCRIBE;").show()
 | |
| 
 | |
| 	def seeds(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				regions.name, 
 | |
| 				seeds.uri 
 | |
| 			FROM 
 | |
| 				consultancy_d.regions 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.seeds ON regions.id = seeds.region_id;
 | |
| 			""").show()
 | |
| 		
 | |
| 	def properties_growth(self):
 | |
| 		return self.connection.sql("""
 | |
| 			WITH PropertiesALL AS (
 | |
| 				SELECT 
 | |
| 					strftime(created_at, '%Y-%m-%d') AS date, 
 | |
| 					COUNT(*) as properties_count,
 | |
| 					SUM(properties_count) OVER (ORDER BY date) AS total
 | |
| 				FROM
 | |
| 					consultancy_d.properties p
 | |
| 				GROUP BY 
 | |
| 					date
 | |
| 				ORDER BY
 | |
| 					date
 | |
| 			),
 | |
| 			PropertiesR1 AS (
 | |
| 				SELECT 
 | |
| 					strftime(created_at, '%Y-%m-%d') AS date, 
 | |
| 					COUNT(*) as properties_count,
 | |
| 					SUM(properties_count) OVER (ORDER BY date) AS total
 | |
| 				FROM
 | |
| 					consultancy_d.properties p
 | |
| 				WHERE
 | |
| 					p.seed_id = 1
 | |
| 				GROUP BY 
 | |
| 					date
 | |
| 				ORDER BY
 | |
| 					date
 | |
| 			),
 | |
| 			PropertiesR2 AS (
 | |
| 				SELECT 
 | |
| 					strftime(created_at, '%Y-%m-%d') AS date, 
 | |
| 					COUNT(*) as properties_count,
 | |
| 					SUM(properties_count) OVER (ORDER BY date) AS total
 | |
| 				FROM
 | |
| 					consultancy_d.properties p
 | |
| 				WHERE
 | |
| 					p.seed_id = 2
 | |
| 				GROUP BY 
 | |
| 					date
 | |
| 				ORDER BY
 | |
| 					date
 | |
| 			),
 | |
| 			PropertiesR3 AS (
 | |
| 				SELECT 
 | |
| 					strftime(created_at, '%Y-%m-%d') AS date, 
 | |
| 					COUNT(*) as properties_count,
 | |
| 					SUM(properties_count) OVER (ORDER BY date) AS total
 | |
| 				FROM
 | |
| 					consultancy_d.properties p
 | |
| 				WHERE
 | |
| 					p.seed_id = 3
 | |
| 				GROUP BY 
 | |
| 					date
 | |
| 				ORDER BY
 | |
| 					date
 | |
| 			),
 | |
| 			PropertiesR4 AS (
 | |
| 				SELECT 
 | |
| 					strftime(created_at, '%Y-%m-%d') AS date, 
 | |
| 					COUNT(*) as properties_count,
 | |
| 					SUM(properties_count) OVER (ORDER BY date) AS total
 | |
| 				FROM
 | |
| 					consultancy_d.properties p
 | |
| 				WHERE
 | |
| 					p.seed_id = 4
 | |
| 				GROUP BY 
 | |
| 					date
 | |
| 				ORDER BY
 | |
| 					date
 | |
| 			)
 | |
| 			SELECT 
 | |
| 				p.date, 
 | |
| 				p.total AS total_all, 
 | |
| 				pR1.total as total_heidiland, 
 | |
| 				pR2.total AS total_davos, 
 | |
| 				pR3.total AS total_engadin, 
 | |
| 				pR4.total AS total_stmoritz  
 | |
| 			FROM 
 | |
| 				PropertiesAll p
 | |
| 			LEFT JOIN 
 | |
| 				PropertiesR1 pR1 ON p.date = pR1.date
 | |
| 			LEFT JOIN 
 | |
| 				PropertiesR2 pR2 ON p.date = pR2.date
 | |
| 			LEFT JOIN 
 | |
| 				PropertiesR3 pR3 ON p.date = pR3.date
 | |
| 			LEFT JOIN 
 | |
| 				PropertiesR4 pR4 ON p.date = pR4.date
 | |
| 			ORDER BY 
 | |
| 				p.date
 | |
| 			""")
 | |
| 	
 | |
| 	def properties_per_region(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				regions.name, 
 | |
| 				COUNT(*) AS count_properties 
 | |
| 			FROM 
 | |
| 				consultancy_d.properties 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.seeds ON seeds.id = properties.seed_id 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.regions ON regions.id = seeds.region_id 
 | |
| 			GROUP BY 
 | |
| 				properties.seed_id, 
 | |
| 				regions.name
 | |
| 			ORDER BY
 | |
| 				count_properties ASC
 | |
| 			""")
 | |
| 
 | |
| 	def propIds_with_region(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				properties.id, seed_id, regions.name
 | |
| 			FROM 
 | |
| 				consultancy_d.properties 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.seeds ON seeds.id = properties.seed_id 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.regions ON regions.id = seeds.region_id 
 | |
| 			""")
 | |
| 
 | |
| 	def properties_unreachable(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				entity_id, 
 | |
| 				strftime(properties.created_at, '%Y-%m-%d') AS first_found, 
 | |
| 				strftime(properties.last_found, '%Y-%m-%d') AS last_found 
 | |
| 			FROM 
 | |
| 				consultancy_d.exceptions 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.properties ON properties.id = exceptions.entity_id 
 | |
| 			WHERE 
 | |
| 				JSON_VALID(exception) = true AND
 | |
| 				JSON_EXTRACT(exception, '$.status') = '404' 
 | |
| 			GROUP BY ALL
 | |
| 			ORDER BY 
 | |
| 				last_found
 | |
| 			""").show()
 | |
| 
 | |
| 	def properties_not_found(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT
 | |
| 			 	COUNT(entity_id) as count_props,
 | |
| 				strftime(created_at, '%Y-%m-%d') as date
 | |
| 			FROM
 | |
| 				consultancy_d.exceptions
 | |
| 			WHERE
 | |
| 				JSON_VALID(exception) = true AND
 | |
| 				JSON_EXTRACT(exception, '$.status') > 400
 | |
| 			GROUP BY
 | |
| 			  date
 | |
| 			""").show()
 | |
| 
 | |
| 	def properties_distance(self):
 | |
| 		
 | |
| 		return self.connection.sql("""
 | |
| 			LOAD spatial;
 | |
| 
 | |
| 			CREATE OR REPLACE VIEW  geolocation_changes AS
 | |
| 			SELECT
 | |
| 			  exceptions.entity_id,
 | |
| 			  properties.check_data AS geolocation_original,
 | |
| 			  SUBSTRING(exceptions.exception, 28) AS geolocation_new,
 | |
| 			  ST_Distance_Sphere(
 | |
| 			    ST_GeomFromText(
 | |
| 			      CONCAT(
 | |
| 			        'POINT(',
 | |
| 			        REPLACE(properties.check_data, ',', ' '),
 | |
| 			        ')'
 | |
| 			      )
 | |
| 			    ),
 | |
| 			    ST_GeomFromText(
 | |
| 			      CONCAT(
 | |
| 			        'POINT(',
 | |
| 			        REPLACE(SUBSTRING(exceptions.exception, 28), ',', ' '),
 | |
| 			        ')'
 | |
| 			      )
 | |
| 			    )
 | |
| 			  ) AS distance
 | |
| 			FROM
 | |
| 			  consultancy_d.exceptions
 | |
| 			LEFT JOIN 
 | |
| 			  	consultancy_d.properties ON exceptions.entity_id = properties.id
 | |
| 			WHERE
 | |
| 			  exception LIKE 'geoLocation was different%'
 | |
| 			GROUP BY
 | |
| 			  entity_id,
 | |
| 			  check_data,
 | |
| 			  geolocation_new
 | |
| 			ORDER BY
 | |
| 			  distance;
 | |
| 
 | |
| 			SELECT * FROM geolocation_changes;
 | |
| 
 | |
| 			SELECT
 | |
| 			  '0 bis 25' AS category,
 | |
| 			  COUNT(*) as count_properties
 | |
| 			FROM
 | |
| 			  geolocation_changes
 | |
| 			WHERE
 | |
| 			  distance >= (0)
 | |
| 			  AND distance < (25)
 | |
| 			UNION
 | |
| 			SELECT
 | |
| 			  '25 bis 50' AS category,
 | |
| 			  COUNT(*) as count_properties
 | |
| 			FROM
 | |
| 			  geolocation_changes
 | |
| 			WHERE
 | |
| 			  distance >= (25)
 | |
| 			  AND distance < (50)
 | |
| 			UNION
 | |
| 			SELECT
 | |
| 			  '50 bis 75' AS category,
 | |
| 			  COUNT(*) as count_properties
 | |
| 			FROM
 | |
| 			  geolocation_changes
 | |
| 			WHERE
 | |
| 			  distance >= (50)
 | |
| 			  AND distance < (75)
 | |
| 			UNION
 | |
| 			SELECT
 | |
| 			  '75 bis 100' AS category,
 | |
| 			  COUNT(*) as count_properties
 | |
| 			FROM
 | |
| 			  geolocation_changes
 | |
| 			WHERE
 | |
| 			  distance >= (75)
 | |
| 			  AND distance < (100);
 | |
| 			
 | |
| 			""")
 | |
| 
 | |
| 	def properties_exceptions(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT
 | |
| 			  JSON_EXTRACT(exception, '$.status') AS exception_status,
 | |
| 			  COUNT(JSON_EXTRACT(exception, '$.status')) AS exception_count
 | |
| 			FROM
 | |
| 			  consultancy_d.exceptions
 | |
| 			WHERE
 | |
| 			  type != 'property'
 | |
| 			GROUP BY
 | |
| 			  JSON_EXTRACT(exception, '$.status')
 | |
| 		""")
 | |
| 
 | |
| 	def extractions(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				JSON_EXTRACT(body, '$.content.days') as calendar,
 | |
| 				property_id,
 | |
| 				created_at 
 | |
| 			FROM 
 | |
| 				consultancy_d.extractions
 | |
| 			WHERE
 | |
| 				type == 'calendar'
 | |
| 			ORDER BY
 | |
| 				property_id
 | |
| 		""")
 | |
| 
 | |
| 	def extractions_with_region(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				JSON_EXTRACT(body, '$.content.days') as calendar, 
 | |
| 				extractions.property_id, 
 | |
| 				extractions.created_at,
 | |
| 				properties.seed_id, 
 | |
| 				regions.name
 | |
| 			FROM 
 | |
| 				consultancy_d.extractions 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.properties ON properties.id = extractions.property_id 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.seeds ON seeds.id = properties.seed_id 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.regions ON regions.id = seeds.region_id 
 | |
| 			""")
 | |
| 
 | |
| 	def extractions_for(self, property_id):
 | |
| 		return self.connection.sql(f"""
 | |
| 			SELECT 
 | |
| 				JSON_EXTRACT(body, '$.content.days') as calendar,
 | |
| 				created_at 
 | |
| 			FROM 
 | |
| 				consultancy_d.extractions
 | |
| 			WHERE
 | |
| 				type == 'calendar' AND
 | |
| 				property_id = {property_id} AND
 | |
| 				calendar NOT NULL
 | |
| 			ORDER BY
 | |
| 				created_at
 | |
| 		""")
 | |
| 
 | |
| 	# Anzahl der extrahierten properties pro Exktraktionsvorgang
 | |
| 	def properties_per_extraction(self, property_id):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT
 | |
| 				COUNT(property_id),
 | |
| 				strftime(created_at, '%Y-%m-%d') AS date
 | |
| 			FROM
 | |
| 			  consultancy_d.extractions
 | |
| 			WHERE
 | |
| 			  type == 'calendar'
 | |
| 			GROUP BY
 | |
| 				date
 | |
| 			ORDER BY date ASC
 | |
| 		""")
 | |
| 
 | |
| 	def price(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				JSON_EXTRACT(body, '$.content.lowestPrice.valueWeekRaw') AS pricePerWeek,
 | |
| 				JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
 | |
| 				JSON_EXTRACT(body, '$.content.lowestPrice.currency') AS currency,
 | |
| 				property_id,
 | |
| 				created_at 
 | |
| 			FROM 
 | |
| 				consultancy_d.extractions
 | |
| 			WHERE
 | |
| 				type == 'price'
 | |
| 			ORDER BY property_id
 | |
| 		""")
 | |
| 
 | |
| 	def price_developement_per_property(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT 
 | |
| 				JSON_EXTRACT(body, '$.content.lowestPrice.valueNightRaw') AS pricePerNight,
 | |
| 				property_id,
 | |
| 				created_at
 | |
| 			FROM 
 | |
| 				consultancy_d.extractions
 | |
| 			WHERE
 | |
| 				type == 'price'
 | |
| 			ORDER BY property_id
 | |
| 		""")
 | |
| 
 | |
| 	def property_base_data(self, id):
 | |
| 		return self.connection.sql(f"""
 | |
| 			SELECT
 | |
| 				p.property_platform_id,
 | |
| 				p.created_at as first_found,
 | |
| 				p.last_found,
 | |
| 				r.name as region_name 
 | |
| 			FROM
 | |
| 				consultancy_d.properties p
 | |
| 			INNER JOIN consultancy_d.seeds s ON s.id = p.seed_id 
 | |
| 			INNER JOIN consultancy_d.regions r ON s.region_id = r.id
 | |
| 			WHERE
 | |
| 				p.id = {id}
 | |
| 			""")
 | |
| 
 | |
| 	def properties_geo(self):
 | |
| 		return self.connection.sql("""
 | |
| 			SELECT
 | |
| 				p.id,
 | |
| 				p.check_data as coordinates
 | |
| 			FROM
 | |
| 				consultancy_d.properties p
 | |
| 		""")
 | |
| 
 | |
| 	def capacity_of_region(self, region_id):
 | |
| 		return self.connection.sql(f"""
 | |
| 			SELECT 
 | |
| 				JSON_EXTRACT(body, '$.content.days') as calendarBody, 
 | |
| 				strftime(extractions.created_at, '%Y-%m-%d') AS ScrapeDate,
 | |
| 				extractions.property_id, 
 | |
| 			FROM 
 | |
| 				consultancy_d.extractions 
 | |
| 			LEFT JOIN 
 | |
| 				consultancy_d.properties ON properties.id = extractions.property_id 
 | |
| 			WHERE
 | |
| 				type == 'calendar' AND
 | |
| 				properties.seed_id = {region_id}		
 | |
| 			""")
 | |
| 
 | |
| 
 | |
| 
 | 
