ai-tax-agent/libs/calibration/metrics.py

"""Calibration metrics for evaluating confidence calibration."""
import numpy as np
class ConfidenceMetrics:
"""Calculate calibration metrics"""
    @staticmethod
    def expected_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """Calculate Expected Calibration Error (ECE).

        ECE is the average absolute gap between mean confidence and accuracy
        across confidence bins, weighted by the fraction of samples per bin.

        Args:
            scores: Predicted confidence scores.
            labels: True labels.
            n_bins: Number of bins for calibration.

        Returns:
            ECE value.
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)
        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        ece = 0.0
        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin (half-open: lower < score <= upper)
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            prop_in_bin = in_bin.mean()
            if prop_in_bin > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()
                # Weight the gap by the fraction of samples in the bin
                ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
        return float(ece)
    @staticmethod
    def maximum_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """Calculate Maximum Calibration Error (MCE).

        MCE reports the single largest confidence/accuracy gap across all
        populated bins, rather than the bin-weighted average used by ECE.

        Args:
            scores: Predicted confidence scores.
            labels: True labels.
            n_bins: Number of bins for calibration.

        Returns:
            MCE value.
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)
        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        max_error = 0.0
        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            if in_bin.sum() > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()
                # Keep the largest gap seen so far
                error = np.abs(avg_confidence_in_bin - accuracy_in_bin)
                max_error = max(max_error, error)
        return float(max_error)
    @staticmethod
    def reliability_diagram_data(  # pylint: disable=too-many-locals
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> dict[str, list[float]]:
        """Generate data for a reliability diagram.

        Args:
            scores: Predicted confidence scores.
            labels: True labels.
            n_bins: Number of bins.

        Returns:
            Dictionary with per-bin data for plotting; empty bins are omitted.
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)
        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        bin_centers: list[float] = []
        bin_accuracies: list[float] = []
        bin_confidences: list[float] = []
        bin_counts: list[float] = []
        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            bin_count = in_bin.sum()
            if bin_count > 0:
                bin_center = (bin_lower + bin_upper) / 2
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()
                # Cast to plain Python numbers for plotting/serialization
                bin_centers.append(float(bin_center))
                bin_accuracies.append(float(accuracy_in_bin))
                bin_confidences.append(float(avg_confidence_in_bin))
                bin_counts.append(int(bin_count))
        return {
            "bin_centers": bin_centers,
            "bin_accuracies": bin_accuracies,
            "bin_confidences": bin_confidences,
            "bin_counts": bin_counts,
        }
# Default calibrators for common tasks
DEFAULT_CALIBRATORS = {
    "ocr_confidence": {"method": "temperature"},
    "extraction_confidence": {"method": "platt"},
    "rag_confidence": {"method": "isotonic"},
    "calculation_confidence": {"method": "temperature"},
    "overall_confidence": {"method": "platt"},
}
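
# --- Example usage: a minimal sketch, not part of the original module. ---
# Synthetic, hypothetical scores/labels only; shows how the metrics above
# and the DEFAULT_CALIBRATORS mapping might fit together.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    example_scores = rng.uniform(0.0, 1.0, size=1000)
    # Labels whose hit rate tracks the score, i.e. roughly calibrated input
    example_labels = rng.uniform(size=1000) < example_scores

    ece = ConfidenceMetrics.expected_calibration_error(
        example_scores.tolist(), example_labels.tolist()
    )
    mce = ConfidenceMetrics.maximum_calibration_error(
        example_scores.tolist(), example_labels.tolist()
    )
    diagram = ConfidenceMetrics.reliability_diagram_data(
        example_scores.tolist(), example_labels.tolist()
    )

    print(f"ECE: {ece:.4f}")  # close to 0 for well-calibrated scores
    print(f"MCE: {mce:.4f}")
    print(f"Populated bins: {len(diagram['bin_centers'])}")
    # Assumed usage of DEFAULT_CALIBRATORS: look up a task's method by key
    print(f"OCR calibrator: {DEFAULT_CALIBRATORS['ocr_confidence']['method']}")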