Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
libs/calibration/metrics.py (new file, 144 lines)
@@ -0,0 +1,144 @@
"""Calibration metrics for evaluating confidence calibration."""

import numpy as np


class ConfidenceMetrics:
    """Calculate calibration metrics"""

    @staticmethod
    def expected_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """
        Calculate Expected Calibration Error (ECE)

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins for calibration

        Returns:
            ECE value
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        ece = 0

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            prop_in_bin = in_bin.mean()

            if prop_in_bin > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                # Add to ECE
                ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece

    @staticmethod
    def maximum_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """
        Calculate Maximum Calibration Error (MCE)

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins for calibration

        Returns:
            MCE value
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        max_error = 0

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)

            if in_bin.sum() > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                # Update maximum error
                error = np.abs(avg_confidence_in_bin - accuracy_in_bin)
                max_error = max(max_error, error)

        return max_error

    @staticmethod
    def reliability_diagram_data(  # pylint: disable=too-many-locals
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> dict[str, list[float]]:
        """
        Generate data for reliability diagram

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins

        Returns:
            Dictionary with bin data for plotting
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        bin_centers = []
        bin_accuracies = []
        bin_confidences = []
        bin_counts = []

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            bin_count = in_bin.sum()

            if bin_count > 0:
                bin_center = (bin_lower + bin_upper) / 2
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                bin_centers.append(bin_center)
                bin_accuracies.append(accuracy_in_bin)
                bin_confidences.append(avg_confidence_in_bin)
                bin_counts.append(bin_count)

        return {
            "bin_centers": bin_centers,
            "bin_accuracies": bin_accuracies,
            "bin_confidences": bin_confidences,
            "bin_counts": bin_counts,
        }


# Default calibrators for common tasks
DEFAULT_CALIBRATORS = {
    "ocr_confidence": {"method": "temperature"},
    "extraction_confidence": {"method": "platt"},
    "rag_confidence": {"method": "isotonic"},
    "calculation_confidence": {"method": "temperature"},
    "overall_confidence": {"method": "platt"},
}
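A minimal usage sketch (not part of this commit): the scores/labels below are illustrative only, and the import path assumes libs/ is importable as a package based on the file location libs/calibration/metrics.py.

# Hypothetical usage example; data and import path are assumptions.
from libs.calibration.metrics import ConfidenceMetrics

# Predicted confidences and whether each prediction turned out to be correct.
scores = [0.95, 0.80, 0.65, 0.90, 0.55, 0.99, 0.40, 0.70]
labels = [True, True, False, True, False, True, False, True]

ece = ConfidenceMetrics.expected_calibration_error(scores, labels, n_bins=5)
mce = ConfidenceMetrics.maximum_calibration_error(scores, labels, n_bins=5)
diagram = ConfidenceMetrics.reliability_diagram_data(scores, labels, n_bins=5)

print(f"ECE: {ece:.3f}")  # size-weighted mean |confidence - accuracy| gap across bins
print(f"MCE: {mce:.3f}")  # largest per-bin |confidence - accuracy| gap
print(diagram["bin_centers"], diagram["bin_accuracies"])  # points for a reliability plot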
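The method names in DEFAULT_CALIBRATORS refer to standard calibration techniques: temperature scaling, Platt scaling, and isotonic regression. The calibrator implementations themselves are not included in this file; purely as an assumption-labelled sketch, an "isotonic" entry could be backed by scikit-learn's IsotonicRegression:

# Hypothetical sketch: the calibrator classes are not part of this commit, so
# mapping the "isotonic" method name to scikit-learn here is an assumption.
import numpy as np
from sklearn.isotonic import IsotonicRegression

raw_scores = np.array([0.2, 0.4, 0.6, 0.8, 0.95])  # uncalibrated confidences
was_correct = np.array([0.0, 0.0, 1.0, 1.0, 1.0])  # observed correctness

iso = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
iso.fit(raw_scores, was_correct)            # monotone map: raw score -> empirical accuracy
calibrated = iso.predict(np.array([0.7]))   # calibrated confidence for a new score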