"""Calibration metrics for evaluating confidence calibration."""

import numpy as np


class ConfidenceMetrics:
    """Calculate calibration metrics."""

    @staticmethod
    def expected_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """
        Calculate Expected Calibration Error (ECE).

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins for calibration

        Returns:
            ECE value
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        ece = 0

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            prop_in_bin = in_bin.mean()

            if prop_in_bin > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                # Add to ECE
                ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece
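
    # Worked example (illustrative, not from the original module): with
    # scores=[0.9, 0.8, 0.3], labels=[True, True, False] and n_bins=2, the
    # upper bin holds 0.9 and 0.8 (avg confidence 0.85, accuracy 1.0) and the
    # lower bin holds 0.3 (confidence 0.3, accuracy 0.0), so
    # ECE = (2/3)*|0.85 - 1.0| + (1/3)*|0.3 - 0.0| = 0.2.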

    @staticmethod
    def maximum_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """
        Calculate Maximum Calibration Error (MCE).

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins for calibration

        Returns:
            MCE value
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        max_error = 0

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)

            if in_bin.sum() > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                # Update maximum error
                error = np.abs(avg_confidence_in_bin - accuracy_in_bin)
                max_error = max(max_error, error)

        return max_error
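
    # Worked example (illustrative, not from the original module): on the same
    # toy data as the ECE example above, the per-bin gaps are 0.3 and 0.15, so
    # MCE = max(0.3, 0.15) = 0.3.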

    @staticmethod
    def reliability_diagram_data(  # pylint: disable=too-many-locals
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> dict[str, list[float]]:
        """
        Generate data for reliability diagram.

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins

        Returns:
            Dictionary with bin data for plotting
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        bin_centers = []
        bin_accuracies = []
        bin_confidences = []
        bin_counts = []

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            bin_count = in_bin.sum()

            if bin_count > 0:
                bin_center = (bin_lower + bin_upper) / 2
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                bin_centers.append(bin_center)
                bin_accuracies.append(accuracy_in_bin)
                bin_confidences.append(avg_confidence_in_bin)
                bin_counts.append(bin_count)

        return {
            "bin_centers": bin_centers,
            "bin_accuracies": bin_accuracies,
            "bin_confidences": bin_confidences,
            "bin_counts": bin_counts,
        }


# Default calibrators for common tasks
DEFAULT_CALIBRATORS = {
    "ocr_confidence": {"method": "temperature"},
    "extraction_confidence": {"method": "platt"},
    "rag_confidence": {"method": "isotonic"},
    "calculation_confidence": {"method": "temperature"},
    "overall_confidence": {"method": "platt"},
}
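

# Minimal usage sketch (not part of the original module): exercises the three
# metrics on a small, illustrative set of confidence scores and outcomes. The
# toy values below are assumptions chosen for the example, not project data.
if __name__ == "__main__":
    demo_scores = [0.95, 0.9, 0.8, 0.7, 0.6, 0.55, 0.4, 0.3, 0.2, 0.1]
    demo_labels = [True, True, True, False, True, False, False, False, True, False]

    ece = ConfidenceMetrics.expected_calibration_error(demo_scores, demo_labels)
    mce = ConfidenceMetrics.maximum_calibration_error(demo_scores, demo_labels)
    diagram = ConfidenceMetrics.reliability_diagram_data(demo_scores, demo_labels)

    print(f"ECE: {ece:.3f}")
    print(f"MCE: {mce:.3f}")
    print(f"Reliability diagram bin centers: {diagram['bin_centers']}")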