Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

Commit b324ff09ef by harkon, 2025-10-11 08:41:36 +01:00
276 changed files with 55220 additions and 0 deletions

libs/calibration/__init__.py (new file)

@@ -0,0 +1,12 @@
"""Confidence calibration for ML models."""
from .calibrator import ConfidenceCalibrator
from .metrics import DEFAULT_CALIBRATORS, ConfidenceMetrics
from .multi_model import MultiModelCalibrator
__all__ = [
"ConfidenceCalibrator",
"MultiModelCalibrator",
"ConfidenceMetrics",
"DEFAULT_CALIBRATORS",
]

libs/calibration/calibrator.py (new file)

@@ -0,0 +1,190 @@
"""Confidence calibrator using various methods."""
import pickle
import numpy as np
import structlog
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
logger = structlog.get_logger()
class ConfidenceCalibrator:
"""Calibrate confidence scores using various methods"""
def __init__(self, method: str = "temperature"):
"""
Initialize calibrator
Args:
method: Calibration method ('temperature', 'platt', 'isotonic')
"""
self.method = method
self.calibrator = None
self.temperature = 1.0
self.is_fitted = False
def fit(self, scores: list[float], labels: list[bool]) -> None:
"""
Fit calibration model
Args:
scores: Raw confidence scores (0-1)
labels: True labels (True/False for correct/incorrect)
"""
# Validate inputs
if len(scores) == 0 or len(labels) == 0:
raise ValueError("Scores and labels cannot be empty")
if len(scores) != len(labels):
raise ValueError("Scores and labels must have the same length")
scores_array = np.array(scores).reshape(-1, 1)
labels_array = np.array(labels, dtype=int)
if self.method == "temperature":
self._fit_temperature_scaling(scores_array, labels_array)
elif self.method == "platt":
self._fit_platt_scaling(scores_array, labels_array)
elif self.method == "isotonic":
self._fit_isotonic_regression(scores_array, labels_array)
else:
raise ValueError(f"Unknown calibration method: {self.method}")
self.is_fitted = True
logger.info("Calibrator fitted", method=self.method)
def _fit_temperature_scaling(self, scores: np.ndarray, labels: np.ndarray) -> None:
"""Fit temperature scaling parameter"""
# pylint: disable=import-outside-toplevel
from scipy.optimize import minimize_scalar
def negative_log_likelihood(temperature: float) -> float:
# Convert scores to logits
epsilon = 1e-7
scores_clipped = np.clip(scores.flatten(), epsilon, 1 - epsilon)
logits = np.log(scores_clipped / (1 - scores_clipped))
# Apply temperature scaling
calibrated_logits = logits / temperature
calibrated_probs = 1 / (1 + np.exp(-calibrated_logits))
# Calculate negative log likelihood
nll = -np.mean(
labels * np.log(calibrated_probs + epsilon)
+ (1 - labels) * np.log(1 - calibrated_probs + epsilon)
)
return float(nll)
# Find optimal temperature
result = minimize_scalar( # type: ignore
negative_log_likelihood,
bounds=(0.1, 10.0),
method="bounded", # fmt: skip # pyright: ignore[reportArgumentType]
)
self.temperature = result.x
logger.debug("Temperature scaling fitted", temperature=self.temperature)
def _fit_platt_scaling(self, scores: np.ndarray, labels: np.ndarray) -> None:
"""Fit Platt scaling (logistic regression)"""
# Convert scores to logits
epsilon = 1e-7
scores_clipped = np.clip(scores.flatten(), epsilon, 1 - epsilon)
logits = np.log(scores_clipped / (1 - scores_clipped)).reshape(-1, 1)
# Fit logistic regression
self.calibrator = LogisticRegression()
self.calibrator.fit(logits, labels) # type: ignore
logger.debug("Platt scaling fitted")
def _fit_isotonic_regression(self, scores: np.ndarray, labels: np.ndarray) -> None:
"""Fit isotonic regression"""
self.calibrator = IsotonicRegression(out_of_bounds="clip")
self.calibrator.fit(scores.flatten(), labels) # type: ignore
logger.debug("Isotonic regression fitted")
def calibrate(self, scores: list[float]) -> list[float]:
"""
Calibrate confidence scores
Args:
scores: Raw confidence scores
Returns:
Calibrated confidence scores
"""
if not self.is_fitted:
logger.warning("Calibrator not fitted, returning original scores")
return scores
scores_array = np.array(scores)
if self.method == "temperature":
return self._calibrate_temperature(scores_array)
if self.method == "platt":
return self._calibrate_platt(scores_array)
if self.method == "isotonic":
return self._calibrate_isotonic(scores_array)
return scores
def _calibrate_temperature(self, scores: np.ndarray) -> list[float]:
"""Apply temperature scaling"""
epsilon = 1e-7
scores_clipped = np.clip(scores, epsilon, 1 - epsilon)
# Convert to logits
logits = np.log(scores_clipped / (1 - scores_clipped))
# Apply temperature scaling
calibrated_logits = logits / self.temperature
calibrated_probs = 1 / (1 + np.exp(-calibrated_logits))
return calibrated_probs.tolist() # type: ignore
def _calibrate_platt(self, scores: np.ndarray) -> list[float]:
"""Apply Platt scaling"""
epsilon = 1e-7
scores_clipped = np.clip(scores, epsilon, 1 - epsilon)
# Convert to logits
logits = np.log(scores_clipped / (1 - scores_clipped)).reshape(-1, 1)
# Apply Platt scaling
calibrated_probs = self.calibrator.predict_proba(logits)[:, 1] # type: ignore
return calibrated_probs.tolist() # type: ignore
def _calibrate_isotonic(self, scores: np.ndarray) -> list[float]:
"""Apply isotonic regression"""
calibrated_probs = self.calibrator.predict(scores) # type: ignore
return calibrated_probs.tolist() # type: ignore
def save_model(self, filepath: str) -> None:
"""Save calibration model"""
model_data = {
"method": self.method,
"temperature": self.temperature,
"calibrator": self.calibrator,
"is_fitted": self.is_fitted,
}
with open(filepath, "wb") as f:
pickle.dump(model_data, f)
logger.info("Calibration model saved", filepath=filepath)
def load_model(self, filepath: str) -> None:
"""Load calibration model"""
with open(filepath, "rb") as f:
model_data = pickle.load(f)
self.method = model_data["method"]
self.temperature = model_data["temperature"]
self.calibrator = model_data["calibrator"]
self.is_fitted = model_data["is_fitted"]
logger.info("Calibration model loaded", filepath=filepath, method=self.method)

libs/calibration/metrics.py (new file)

@@ -0,0 +1,144 @@
"""Calibration metrics for evaluating confidence calibration."""
import numpy as np
class ConfidenceMetrics:
"""Calculate calibration metrics"""
@staticmethod
def expected_calibration_error(
scores: list[float], labels: list[bool], n_bins: int = 10
) -> float:
"""
Calculate Expected Calibration Error (ECE)
Args:
scores: Predicted confidence scores
labels: True labels
n_bins: Number of bins for calibration
Returns:
ECE value
"""
scores_array = np.array(scores)
labels_array = np.array(labels, dtype=int)
bin_boundaries = np.linspace(0, 1, n_bins + 1)
bin_lowers = bin_boundaries[:-1]
bin_uppers = bin_boundaries[1:]
ece = 0
for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
# Find samples in this bin
in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
prop_in_bin = in_bin.mean()
if prop_in_bin > 0:
# Calculate accuracy and confidence in this bin
accuracy_in_bin = labels_array[in_bin].mean()
avg_confidence_in_bin = scores_array[in_bin].mean()
# Add to ECE
ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
return ece
@staticmethod
def maximum_calibration_error(
scores: list[float], labels: list[bool], n_bins: int = 10
) -> float:
"""
Calculate Maximum Calibration Error (MCE)
Args:
scores: Predicted confidence scores
labels: True labels
n_bins: Number of bins for calibration
Returns:
MCE value
"""
scores_array = np.array(scores)
labels_array = np.array(labels, dtype=int)
bin_boundaries = np.linspace(0, 1, n_bins + 1)
bin_lowers = bin_boundaries[:-1]
bin_uppers = bin_boundaries[1:]
max_error = 0
for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
# Find samples in this bin
in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
if in_bin.sum() > 0:
# Calculate accuracy and confidence in this bin
accuracy_in_bin = labels_array[in_bin].mean()
avg_confidence_in_bin = scores_array[in_bin].mean()
# Update maximum error
error = np.abs(avg_confidence_in_bin - accuracy_in_bin)
max_error = max(max_error, error)
return max_error
@staticmethod
def reliability_diagram_data( # pylint: disable=too-many-locals
scores: list[float], labels: list[bool], n_bins: int = 10
) -> dict[str, list[float]]:
"""
Generate data for reliability diagram
Args:
scores: Predicted confidence scores
labels: True labels
n_bins: Number of bins
Returns:
Dictionary with bin data for plotting
"""
scores_array = np.array(scores)
labels_array = np.array(labels, dtype=int)
bin_boundaries = np.linspace(0, 1, n_bins + 1)
bin_lowers = bin_boundaries[:-1]
bin_uppers = bin_boundaries[1:]
bin_centers = []
bin_accuracies = []
bin_confidences = []
bin_counts = []
for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
# Find samples in this bin
in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
bin_count = in_bin.sum()
if bin_count > 0:
bin_center = (bin_lower + bin_upper) / 2
accuracy_in_bin = labels_array[in_bin].mean()
avg_confidence_in_bin = scores_array[in_bin].mean()
bin_centers.append(bin_center)
bin_accuracies.append(accuracy_in_bin)
bin_confidences.append(avg_confidence_in_bin)
bin_counts.append(bin_count)
return {
"bin_centers": bin_centers,
"bin_accuracies": bin_accuracies,
"bin_confidences": bin_confidences,
"bin_counts": bin_counts,
}
# Default calibrators for common tasks
DEFAULT_CALIBRATORS = {
"ocr_confidence": {"method": "temperature"},
"extraction_confidence": {"method": "platt"},
"rag_confidence": {"method": "isotonic"},
"calculation_confidence": {"method": "temperature"},
"overall_confidence": {"method": "platt"},
}

libs/calibration/multi_model.py (new file)

@@ -0,0 +1,85 @@
"""Multi-model calibrator for handling multiple models/tasks."""
import glob
import os
import structlog
from .calibrator import ConfidenceCalibrator
logger = structlog.get_logger()
class MultiModelCalibrator:
"""Calibrate confidence scores for multiple models/tasks"""
def __init__(self) -> None:
self.calibrators: dict[str, ConfidenceCalibrator] = {}
def add_calibrator(self, model_name: str, method: str = "temperature") -> None:
"""Add calibrator for a specific model"""
self.calibrators[model_name] = ConfidenceCalibrator(method)
logger.info("Added calibrator", model=model_name, method=method)
def fit(self, model_name: str, scores: list[float], labels: list[bool]) -> None:
"""Fit calibrator for specific model"""
if model_name not in self.calibrators:
self.add_calibrator(model_name)
self.calibrators[model_name].fit(scores, labels)
def calibrate(self, model_name: str, scores: list[float]) -> list[float]:
"""Calibrate scores for specific model"""
if model_name not in self.calibrators:
logger.warning("No calibrator for model", model=model_name)
return scores
return self.calibrators[model_name].calibrate(scores)
def save_all(self, directory: str) -> None:
"""Save all calibrators"""
os.makedirs(directory, exist_ok=True)
for model_name, calibrator in self.calibrators.items():
filepath = os.path.join(directory, f"{model_name}_calibrator.pkl")
calibrator.save_model(filepath)
def load_all(self, directory: str) -> None:
"""Load all calibrators from directory"""
pattern = os.path.join(directory, "*_calibrator.pkl")
for filepath in glob.glob(pattern):
filename = os.path.basename(filepath)
model_name = filename.replace("_calibrator.pkl", "")
calibrator = ConfidenceCalibrator()
calibrator.load_model(filepath)
self.calibrators[model_name] = calibrator
def save_models(self, directory: str) -> None:
"""Save all calibrators (alias for save_all)"""
self.save_all(directory)
def load_models(self, directory: str) -> None:
"""Load all calibrators from directory (alias for load_all)"""
self.load_all(directory)
def get_model_names(self) -> list[str]:
"""Get list of model names"""
return list(self.calibrators.keys())
def has_model(self, model_name: str) -> bool:
"""Check if model exists"""
return model_name in self.calibrators
def is_fitted(self, model_name: str) -> bool:
"""Check if model is fitted"""
if model_name not in self.calibrators:
raise ValueError(f"Model '{model_name}' not found")
return self.calibrators[model_name].is_fitted
def remove_calibrator(self, model_name: str) -> None:
"""Remove calibrator for specific model"""
if model_name not in self.calibrators:
raise ValueError(f"Model '{model_name}' not found")
del self.calibrators[model_name]
logger.info("Removed calibrator", model=model_name)