"""Calibration metrics for evaluating confidence calibration.""" import numpy as np class ConfidenceMetrics: """Calculate calibration metrics""" @staticmethod def expected_calibration_error( scores: list[float], labels: list[bool], n_bins: int = 10 ) -> float: """ Calculate Expected Calibration Error (ECE) Args: scores: Predicted confidence scores labels: True labels n_bins: Number of bins for calibration Returns: ECE value """ scores_array = np.array(scores) labels_array = np.array(labels, dtype=int) bin_boundaries = np.linspace(0, 1, n_bins + 1) bin_lowers = bin_boundaries[:-1] bin_uppers = bin_boundaries[1:] ece = 0 for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False): # Find samples in this bin in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper) prop_in_bin = in_bin.mean() if prop_in_bin > 0: # Calculate accuracy and confidence in this bin accuracy_in_bin = labels_array[in_bin].mean() avg_confidence_in_bin = scores_array[in_bin].mean() # Add to ECE ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin return ece @staticmethod def maximum_calibration_error( scores: list[float], labels: list[bool], n_bins: int = 10 ) -> float: """ Calculate Maximum Calibration Error (MCE) Args: scores: Predicted confidence scores labels: True labels n_bins: Number of bins for calibration Returns: MCE value """ scores_array = np.array(scores) labels_array = np.array(labels, dtype=int) bin_boundaries = np.linspace(0, 1, n_bins + 1) bin_lowers = bin_boundaries[:-1] bin_uppers = bin_boundaries[1:] max_error = 0 for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False): # Find samples in this bin in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper) if in_bin.sum() > 0: # Calculate accuracy and confidence in this bin accuracy_in_bin = labels_array[in_bin].mean() avg_confidence_in_bin = scores_array[in_bin].mean() # Update maximum error error = np.abs(avg_confidence_in_bin - accuracy_in_bin) max_error = max(max_error, error) return max_error @staticmethod def reliability_diagram_data( # pylint: disable=too-many-locals scores: list[float], labels: list[bool], n_bins: int = 10 ) -> dict[str, list[float]]: """ Generate data for reliability diagram Args: scores: Predicted confidence scores labels: True labels n_bins: Number of bins Returns: Dictionary with bin data for plotting """ scores_array = np.array(scores) labels_array = np.array(labels, dtype=int) bin_boundaries = np.linspace(0, 1, n_bins + 1) bin_lowers = bin_boundaries[:-1] bin_uppers = bin_boundaries[1:] bin_centers = [] bin_accuracies = [] bin_confidences = [] bin_counts = [] for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False): # Find samples in this bin in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper) bin_count = in_bin.sum() if bin_count > 0: bin_center = (bin_lower + bin_upper) / 2 accuracy_in_bin = labels_array[in_bin].mean() avg_confidence_in_bin = scores_array[in_bin].mean() bin_centers.append(bin_center) bin_accuracies.append(accuracy_in_bin) bin_confidences.append(avg_confidence_in_bin) bin_counts.append(bin_count) return { "bin_centers": bin_centers, "bin_accuracies": bin_accuracies, "bin_confidences": bin_confidences, "bin_counts": bin_counts, } # Default calibrators for common tasks DEFAULT_CALIBRATORS = { "ocr_confidence": {"method": "temperature"}, "extraction_confidence": {"method": "platt"}, "rag_confidence": {"method": "isotonic"}, "calculation_confidence": {"method": "temperature"}, "overall_confidence": {"method": "platt"}, }