"""
analyze_me.py – A data-processing script used in Exercise 2
==============================================================
This file contains several realistic bugs and style issues.
Do NOT fix them manually — in Exercise 2 the LLM will help you find them!

Can you spot the issues yourself before asking the LLM?
"""
|
||
|
||
|
||
def calculate_statistics(numbers: list[float]) -> dict[str, float]:
    """
    Calculate basic statistical measures for a list of numbers.

    This function computes the count, sum, average, minimum, maximum, and
    sample variance (divisor ``n - 1``) of the provided list of numbers.

    Parameters
    ----------
    numbers : list[float]
        A list of numeric values to analyze. May be empty.

    Returns
    -------
    dict[str, float]
        A dictionary containing the following keys:
        - 'count': The number of elements in the list.
        - 'sum': The sum of all elements.
        - 'average': The arithmetic mean of the elements.
        - 'min': The minimum value in the list.
        - 'max': The maximum value in the list.
        - 'variance': The sample variance of the elements, or 0.0 when
          the list holds fewer than two elements.

        For an empty list, 'count' is 0 and every other entry is 0.0;
        no exception is raised.

    Examples
    --------
    >>> calculate_statistics([2.0, 4.0])["variance"]
    2.0
    """
    if not numbers:
        # Empty input is reported as all zeros rather than raising, so
        # callers do not have to special-case files without usable data.
        return {
            "count": 0,
            "sum": 0.0,
            "average": 0.0,
            "min": 0.0,
            "max": 0.0,
            "variance": 0.0,
        }

    count = len(numbers)
    total = sum(numbers)
    average = total / count

    # Sample variance: divide by (n - 1). A single observation carries no
    # spread information, so its variance is defined here as 0.0 instead
    # of dividing by zero.
    squared_deviations = sum((n - average) ** 2 for n in numbers)
    variance = squared_deviations / (count - 1) if count > 1 else 0.0

    return {
        "count": count,
        "sum": total,
        "average": average,
        "min": min(numbers),
        "max": max(numbers),
        "variance": variance,
    }


# Step 4 – Define type hints and docstrings for process_data
def process_data(filename: str) -> dict[str, float]:
    """
    Read numeric data from a file and compute statistics.

    This function opens a text file and parses each non-blank line with
    ``float()``, so both integer and decimal literals are accepted. Blank
    lines and lines that cannot be parsed as a number are skipped
    silently. The collected values are passed to calculate_statistics;
    the resulting summary is printed to standard output and returned.

    Parameters
    ----------
    filename : str
        The path to the text file containing one number per line.

    Returns
    -------
    dict[str, float]
        The dictionary returned by calculate_statistics for the values
        that were successfully parsed from the file.

    Raises
    ------
    FileNotFoundError
        If the specified file does not exist.
    """
    numbers: list[float] = []
    with open(filename) as f:
        for line in f:
            text = line.strip()
            if not text:
                continue
            # Keep the try body to the single call that can fail, so an
            # unrelated ValueError is never swallowed by accident.
            try:
                value = float(text)
            except ValueError:
                # Best-effort parsing: non-numeric lines are ignored.
                continue
            numbers.append(value)

    result = calculate_statistics(numbers)
    print("Statistics:", result)
    return result


# Step 6 – Define type hints and docstrings for normalize
def normalize(numbers: list[float], method: str = "minmax") -> list[float]:
    """
    Normalize a list of numbers using the specified method.

    This function applies either 'minmax' scaling or 'zscore'
    standardization to the input list of numbers. An empty input list
    yields an empty list for both methods.

    Parameters
    ----------
    numbers : list[float]
        A list of numeric values to normalize.
    method : str, optional
        The normalization method to use. Options are:
        - 'minmax': Scales values to the range [0, 1].
        - 'zscore': Subtracts the 'average' and divides by the square
          root of the 'variance' reported by calculate_statistics.
        Default is 'minmax'.

    Returns
    -------
    list[float]
        A list of normalized values, in the same order as the input.

    Raises
    ------
    ValueError
        If an unknown normalization method is provided.
    ZeroDivisionError
        If 'minmax' is used on a non-empty list where all values are
        identical (range is 0), or if 'zscore' is used on a non-empty
        list whose variance is 0 (including a single-element list).

    Examples
    --------
    >>> normalize([1, 2, 3, 4, 5])
    [0.0, 0.25, 0.5, 0.75, 1.0]
    >>> normalize([])
    []
    """
    # Validate the method first so a bad name is reported even when the
    # input happens to be empty.
    if method not in ("minmax", "zscore"):
        raise ValueError(f"Unknown normalisation method: {method}")

    # Nothing to scale; this also keeps min()/max() from raising on [].
    if not numbers:
        return []

    if method == "minmax":
        lowest = min(numbers)
        span = max(numbers) - lowest
        return [(x - lowest) / span for x in numbers]

    stats = calculate_statistics(numbers)
    mean = stats["average"]
    std = stats["variance"] ** 0.5
    return [(x - mean) / std for x in numbers]
|
||
|
||
|
||
if __name__ == "__main__":
    # Self-test run only when the module is executed as a script: checks
    # calculate_statistics on a known sample and on an empty list, then
    # checks the default min-max normalisation.
    data = [4, 8, 15, 16, 23, 42]
    summary = calculate_statistics(data)

    # Reference values for the sample, computed independently.
    want_count = 6
    want_sum = 4 + 8 + 15 + 16 + 23 + 42
    want_avg = want_sum / want_count

    assert summary["count"] == want_count, f"Count mismatch: {summary['count']} != {want_count}"
    assert summary["sum"] == want_sum, f"Sum mismatch: {summary['sum']} != {want_sum}"
    assert abs(summary["average"] - want_avg) < 1e-9, f"Average mismatch: {summary['average']} != {want_avg}"
    assert summary["min"] == 4, f"Min mismatch: {summary['min']} != 4"
    assert summary["max"] == 42, f"Max mismatch: {summary['max']} != 42"

    # An empty list must produce an all-zero summary.
    empty_summary = calculate_statistics([])
    empty_checks = (
        ("count", 0, "Empty list count should be 0"),
        ("sum", 0.0, "Empty list sum should be 0.0"),
        ("average", 0.0, "Empty list average should be 0.0"),
        ("min", 0.0, "Empty list min should be 0.0"),
        ("max", 0.0, "Empty list max should be 0.0"),
        ("variance", 0.0, "Empty list variance should be 0.0"),
    )
    for key, expected, message in empty_checks:
        assert empty_summary[key] == expected, message

    # Default (min-max) normalisation of an evenly spaced list.
    scaled = normalize([1, 2, 3, 4, 5])
    want_scaled = [0.0, 0.25, 0.5, 0.75, 1.0]
    assert len(scaled) == 5, "Normalized list length mismatch"
    for i, (got, want) in enumerate(zip(scaled, want_scaled)):
        assert abs(got - want) < 1e-9, f"Normalized value mismatch at index {i}"

    print("All sanity checks passed!")