data_quality.py erstellt zur Visualisierung der Datenqualität
This commit is contained in:
		
							parent
							
								
									338d3e9cc2
								
							
						
					
					
						commit
						125250a665
					
				@ -221,6 +221,7 @@ class Database:
 | 
			
		||||
				property_id
 | 
			
		||||
		""")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	def extractions_for(self, property_id):
 | 
			
		||||
		return self.connection.sql(f"""
 | 
			
		||||
			SELECT 
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										83
									
								
								src/mauro/data_quality.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										83
									
								
								src/mauro/data_quality.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,83 @@
 | 
			
		||||
import data
 | 
			
		||||
import json
 | 
			
		||||
import polars as pl
 | 
			
		||||
from datetime import datetime
 | 
			
		||||
import matplotlib.pyplot as plt
 | 
			
		||||
import numpy as np
 | 
			
		||||
 | 
			
		||||
'''
 | 
			
		||||
# Get Data from DB
 | 
			
		||||
inst = data.load()
 | 
			
		||||
 | 
			
		||||
df = inst.extractions().pl()
 | 
			
		||||
print(df)
 | 
			
		||||
 | 
			
		||||
counter = 0
 | 
			
		||||
data = []
 | 
			
		||||
for row in df.iter_rows():
 | 
			
		||||
    property_id = row[1]
 | 
			
		||||
    created_at = row[2].date()
 | 
			
		||||
    dict = {'property_id': property_id, 'created_at': created_at}
 | 
			
		||||
 | 
			
		||||
    jsonStr = row[0]
 | 
			
		||||
    if jsonStr:
 | 
			
		||||
        calendarDict = json.loads(jsonStr)
 | 
			
		||||
        for key in calendarDict:
 | 
			
		||||
            dict[key] = calendarDict[key]
 | 
			
		||||
 | 
			
		||||
    data.append(dict)
 | 
			
		||||
 | 
			
		||||
dfNew = pl.from_dicts(data)
 | 
			
		||||
dfNew.write_csv('results/data_quality.csv')
 | 
			
		||||
print(dfNew)
 | 
			
		||||
'''
 | 
			
		||||
 | 
			
		||||
# Visualise data quality per property over time.
#
# Reads the flattened extraction table from 'results/data_quality.csv',
# sums all value columns per row, arranges the sums in a
# (date x property) matrix and shows two heatmaps: the raw sums and the
# absolute change between consecutive date rows.

dfNew = pl.read_csv('results/data_quality.csv')
# Parse the date strings natively instead of calling a Python lambda per row.
dfNew = dfNew.with_columns(pl.col("created_at").str.to_date("%Y-%m-%d"))

# Create Row Means
# Every column except the two leading key columns contributes to the row sum.
valueColumns = [
    name for name in dfNew.columns
    if name not in ("property_id", "created_at")
]

# Create new DF with only property_id, created_at and sum
df = dfNew.select(
    "property_id",
    "created_at",
    pl.sum_horizontal(valueColumns).alias("sum"),
)

# Get unique property_ids
propsIDs = sorted(df.get_column("property_id").unique().to_list())

# create Matrix
# One dict per property, mapping 'YYYY-MM-DD' -> row sum; each dict becomes
# one row of the frame below.
matrix = []
for propertyId in propsIDs:
    propertyRows = df.filter(pl.col("property_id") == propertyId)
    matrix.append({
        createdAt.strftime('%Y-%m-%d'): rowSum
        for _, createdAt, rowSum in propertyRows.iter_rows()
    })

# NOTE(review): the column (date) order is whatever polars derives from the
# dict keys; confirm the dates come out chronologically for the real data.
matrix = pl.DataFrame(matrix)
dates = matrix.columns

# Transpose so that rows are dates and columns are properties.
matrix = matrix.to_numpy().T

yRange = range(len(dates))
plt.imshow(matrix)
plt.yticks(yRange[::5], dates[::5])

# Create DiffMatrix
# Absolute difference between each date row and the following one.
diffMatrix = np.abs(np.diff(matrix, axis=0)).astype(float)

# The diff matrix has one row fewer than `matrix`, so its ticks must stop one
# date earlier; otherwise a tick can fall outside the image.
diffRange = range(len(dates) - 1)

plt.figure()
plt.imshow(diffMatrix)
plt.yticks(diffRange[::5], dates[:-1][::5])
plt.show()
 | 
			
		||||
							
								
								
									
										58186
									
								
								src/mauro/results/data_quality.csv
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										58186
									
								
								src/mauro/results/data_quality.csv
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because one or more lines are too long
											
										
									
								
							@ -103,13 +103,13 @@ print("Global meanPreorderTime = ",round(merge.get_column("meanPreorderScrapeNum
 | 
			
		||||
 | 
			
		||||
# 1 = Heidiland
 | 
			
		||||
heidi = merge.filter(pl.col("seed_id") == 1)
 | 
			
		||||
print("Heidiland meanPreorderTime = ",round(heidi.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
print("Heidiland meanPreorderTime = ", round(heidi.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
# 2 = Davos
 | 
			
		||||
Davos = merge.filter(pl.col("seed_id") == 2)
 | 
			
		||||
print("Davos meanPreorderTime = ",round(Davos.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
# Rounded to two decimals, consistent with the other regions' output.
print("Davos meanPreorderTime = ", round(Davos.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
# 3 = Engadin
 | 
			
		||||
Engadin = merge.filter(pl.col("seed_id") == 3)
 | 
			
		||||
print("Engadin meanPreorderTime = ",round(Engadin.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
print("Engadin meanPreorderTime = ", round(Engadin.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
# 4 = St. Moritz
 | 
			
		||||
Moritz = merge.filter(pl.col("seed_id") == 4)
 | 
			
		||||
print("St. Moritz meanPreorderTime = ",round(Moritz.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
print("St. Moritz meanPreorderTime = ", round(Moritz.get_column("meanPreorderScrapeNum").mean()*3,2))
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user