Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

Commit b324ff09ef by harkon, 2025-10-11 08:41:36 +01:00
276 changed files with 55220 additions and 0 deletions

libs/calibration/__init__.py (new file)

@@ -0,0 +1,12 @@
"""Confidence calibration for ML models."""
from .calibrator import ConfidenceCalibrator
from .metrics import DEFAULT_CALIBRATORS, ConfidenceMetrics
from .multi_model import MultiModelCalibrator
__all__ = [
"ConfidenceCalibrator",
"MultiModelCalibrator",
"ConfidenceMetrics",
"DEFAULT_CALIBRATORS",
]

libs/calibration/calibrator.py (new file)

@@ -0,0 +1,190 @@
"""Confidence calibrator using various methods."""
import pickle
import numpy as np
import structlog
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
logger = structlog.get_logger()
class ConfidenceCalibrator:
"""Calibrate confidence scores using various methods"""
def __init__(self, method: str = "temperature"):
"""
Initialize calibrator
Args:
method: Calibration method ('temperature', 'platt', 'isotonic')
"""
self.method = method
self.calibrator = None
self.temperature = 1.0
self.is_fitted = False
def fit(self, scores: list[float], labels: list[bool]) -> None:
"""
Fit calibration model
Args:
scores: Raw confidence scores (0-1)
labels: True labels (True/False for correct/incorrect)
"""
# Validate inputs
if len(scores) == 0 or len(labels) == 0:
raise ValueError("Scores and labels cannot be empty")
if len(scores) != len(labels):
raise ValueError("Scores and labels must have the same length")
scores_array = np.array(scores).reshape(-1, 1)
labels_array = np.array(labels, dtype=int)
if self.method == "temperature":
self._fit_temperature_scaling(scores_array, labels_array)
elif self.method == "platt":
self._fit_platt_scaling(scores_array, labels_array)
elif self.method == "isotonic":
self._fit_isotonic_regression(scores_array, labels_array)
else:
raise ValueError(f"Unknown calibration method: {self.method}")
self.is_fitted = True
logger.info("Calibrator fitted", method=self.method)
def _fit_temperature_scaling(self, scores: np.ndarray, labels: np.ndarray) -> None:
"""Fit temperature scaling parameter"""
# pylint: disable=import-outside-toplevel
from scipy.optimize import minimize_scalar
def negative_log_likelihood(temperature: float) -> float:
# Convert scores to logits
epsilon = 1e-7
scores_clipped = np.clip(scores.flatten(), epsilon, 1 - epsilon)
logits = np.log(scores_clipped / (1 - scores_clipped))
# Apply temperature scaling
calibrated_logits = logits / temperature
calibrated_probs = 1 / (1 + np.exp(-calibrated_logits))
# Calculate negative log likelihood
nll = -np.mean(
labels * np.log(calibrated_probs + epsilon)
+ (1 - labels) * np.log(1 - calibrated_probs + epsilon)
)
return float(nll)
# Find optimal temperature
result = minimize_scalar( # type: ignore
negative_log_likelihood,
bounds=(0.1, 10.0),
method="bounded", # fmt: skip # pyright: ignore[reportArgumentType]
)
self.temperature = result.x
logger.debug("Temperature scaling fitted", temperature=self.temperature)
def _fit_platt_scaling(self, scores: np.ndarray, labels: np.ndarray) -> None:
"""Fit Platt scaling (logistic regression)"""
# Convert scores to logits
epsilon = 1e-7
scores_clipped = np.clip(scores.flatten(), epsilon, 1 - epsilon)
logits = np.log(scores_clipped / (1 - scores_clipped)).reshape(-1, 1)
# Fit logistic regression
self.calibrator = LogisticRegression()
self.calibrator.fit(logits, labels) # type: ignore
logger.debug("Platt scaling fitted")
def _fit_isotonic_regression(self, scores: np.ndarray, labels: np.ndarray) -> None:
"""Fit isotonic regression"""
self.calibrator = IsotonicRegression(out_of_bounds="clip")
self.calibrator.fit(scores.flatten(), labels) # type: ignore
logger.debug("Isotonic regression fitted")
def calibrate(self, scores: list[float]) -> list[float]:
"""
Calibrate confidence scores
Args:
scores: Raw confidence scores
Returns:
Calibrated confidence scores
"""
if not self.is_fitted:
logger.warning("Calibrator not fitted, returning original scores")
return scores
scores_array = np.array(scores)
if self.method == "temperature":
return self._calibrate_temperature(scores_array)
if self.method == "platt":
return self._calibrate_platt(scores_array)
if self.method == "isotonic":
return self._calibrate_isotonic(scores_array)
return scores
def _calibrate_temperature(self, scores: np.ndarray) -> list[float]:
"""Apply temperature scaling"""
epsilon = 1e-7
scores_clipped = np.clip(scores, epsilon, 1 - epsilon)
# Convert to logits
logits = np.log(scores_clipped / (1 - scores_clipped))
# Apply temperature scaling
calibrated_logits = logits / self.temperature
calibrated_probs = 1 / (1 + np.exp(-calibrated_logits))
return calibrated_probs.tolist() # type: ignore
def _calibrate_platt(self, scores: np.ndarray) -> list[float]:
"""Apply Platt scaling"""
epsilon = 1e-7
scores_clipped = np.clip(scores, epsilon, 1 - epsilon)
# Convert to logits
logits = np.log(scores_clipped / (1 - scores_clipped)).reshape(-1, 1)
# Apply Platt scaling
calibrated_probs = self.calibrator.predict_proba(logits)[:, 1] # type: ignore
return calibrated_probs.tolist() # type: ignore
def _calibrate_isotonic(self, scores: np.ndarray) -> list[float]:
"""Apply isotonic regression"""
calibrated_probs = self.calibrator.predict(scores) # type: ignore
return calibrated_probs.tolist() # type: ignore
def save_model(self, filepath: str) -> None:
"""Save calibration model"""
model_data = {
"method": self.method,
"temperature": self.temperature,
"calibrator": self.calibrator,
"is_fitted": self.is_fitted,
}
with open(filepath, "wb") as f:
pickle.dump(model_data, f)
logger.info("Calibration model saved", filepath=filepath)
def load_model(self, filepath: str) -> None:
"""Load calibration model"""
with open(filepath, "rb") as f:
model_data = pickle.load(f)
self.method = model_data["method"]
self.temperature = model_data["temperature"]
self.calibrator = model_data["calibrator"]
self.is_fitted = model_data["is_fitted"]
logger.info("Calibration model loaded", filepath=filepath, method=self.method)

libs/calibration/metrics.py (new file)

@@ -0,0 +1,144 @@
"""Calibration metrics for evaluating confidence calibration."""
import numpy as np
class ConfidenceMetrics:
"""Calculate calibration metrics"""
@staticmethod
def expected_calibration_error(
scores: list[float], labels: list[bool], n_bins: int = 10
) -> float:
"""
Calculate Expected Calibration Error (ECE)
Args:
scores: Predicted confidence scores
labels: True labels
n_bins: Number of bins for calibration
Returns:
ECE value
"""
scores_array = np.array(scores)
labels_array = np.array(labels, dtype=int)
bin_boundaries = np.linspace(0, 1, n_bins + 1)
bin_lowers = bin_boundaries[:-1]
bin_uppers = bin_boundaries[1:]
ece = 0
for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
# Find samples in this bin
in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
prop_in_bin = in_bin.mean()
if prop_in_bin > 0:
# Calculate accuracy and confidence in this bin
accuracy_in_bin = labels_array[in_bin].mean()
avg_confidence_in_bin = scores_array[in_bin].mean()
# Add to ECE
ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
return ece
@staticmethod
def maximum_calibration_error(
scores: list[float], labels: list[bool], n_bins: int = 10
) -> float:
"""
Calculate Maximum Calibration Error (MCE)
Args:
scores: Predicted confidence scores
labels: True labels
n_bins: Number of bins for calibration
Returns:
MCE value
"""
scores_array = np.array(scores)
labels_array = np.array(labels, dtype=int)
bin_boundaries = np.linspace(0, 1, n_bins + 1)
bin_lowers = bin_boundaries[:-1]
bin_uppers = bin_boundaries[1:]
max_error = 0
for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
# Find samples in this bin
in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
if in_bin.sum() > 0:
# Calculate accuracy and confidence in this bin
accuracy_in_bin = labels_array[in_bin].mean()
avg_confidence_in_bin = scores_array[in_bin].mean()
# Update maximum error
error = np.abs(avg_confidence_in_bin - accuracy_in_bin)
max_error = max(max_error, error)
return max_error
@staticmethod
def reliability_diagram_data( # pylint: disable=too-many-locals
scores: list[float], labels: list[bool], n_bins: int = 10
) -> dict[str, list[float]]:
"""
Generate data for reliability diagram
Args:
scores: Predicted confidence scores
labels: True labels
n_bins: Number of bins
Returns:
Dictionary with bin data for plotting
"""
scores_array = np.array(scores)
labels_array = np.array(labels, dtype=int)
bin_boundaries = np.linspace(0, 1, n_bins + 1)
bin_lowers = bin_boundaries[:-1]
bin_uppers = bin_boundaries[1:]
bin_centers = []
bin_accuracies = []
bin_confidences = []
bin_counts = []
for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
# Find samples in this bin
in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
bin_count = in_bin.sum()
if bin_count > 0:
bin_center = (bin_lower + bin_upper) / 2
accuracy_in_bin = labels_array[in_bin].mean()
avg_confidence_in_bin = scores_array[in_bin].mean()
bin_centers.append(bin_center)
bin_accuracies.append(accuracy_in_bin)
bin_confidences.append(avg_confidence_in_bin)
bin_counts.append(bin_count)
return {
"bin_centers": bin_centers,
"bin_accuracies": bin_accuracies,
"bin_confidences": bin_confidences,
"bin_counts": bin_counts,
}
# Default calibrators for common tasks
DEFAULT_CALIBRATORS = {
"ocr_confidence": {"method": "temperature"},
"extraction_confidence": {"method": "platt"},
"rag_confidence": {"method": "isotonic"},
"calculation_confidence": {"method": "temperature"},
"overall_confidence": {"method": "platt"},
}

libs/calibration/multi_model.py (new file)

@@ -0,0 +1,85 @@
"""Multi-model calibrator for handling multiple models/tasks."""
import glob
import os
import structlog
from .calibrator import ConfidenceCalibrator
logger = structlog.get_logger()
class MultiModelCalibrator:
"""Calibrate confidence scores for multiple models/tasks"""
def __init__(self) -> None:
self.calibrators: dict[str, ConfidenceCalibrator] = {}
def add_calibrator(self, model_name: str, method: str = "temperature") -> None:
"""Add calibrator for a specific model"""
self.calibrators[model_name] = ConfidenceCalibrator(method)
logger.info("Added calibrator", model=model_name, method=method)
def fit(self, model_name: str, scores: list[float], labels: list[bool]) -> None:
"""Fit calibrator for specific model"""
if model_name not in self.calibrators:
self.add_calibrator(model_name)
self.calibrators[model_name].fit(scores, labels)
def calibrate(self, model_name: str, scores: list[float]) -> list[float]:
"""Calibrate scores for specific model"""
if model_name not in self.calibrators:
logger.warning("No calibrator for model", model=model_name)
return scores
return self.calibrators[model_name].calibrate(scores)
def save_all(self, directory: str) -> None:
"""Save all calibrators"""
os.makedirs(directory, exist_ok=True)
for model_name, calibrator in self.calibrators.items():
filepath = os.path.join(directory, f"{model_name}_calibrator.pkl")
calibrator.save_model(filepath)
def load_all(self, directory: str) -> None:
"""Load all calibrators from directory"""
pattern = os.path.join(directory, "*_calibrator.pkl")
for filepath in glob.glob(pattern):
filename = os.path.basename(filepath)
model_name = filename.replace("_calibrator.pkl", "")
calibrator = ConfidenceCalibrator()
calibrator.load_model(filepath)
self.calibrators[model_name] = calibrator
def save_models(self, directory: str) -> None:
"""Save all calibrators (alias for save_all)"""
self.save_all(directory)
def load_models(self, directory: str) -> None:
"""Load all calibrators from directory (alias for load_all)"""
self.load_all(directory)
def get_model_names(self) -> list[str]:
"""Get list of model names"""
return list(self.calibrators.keys())
def has_model(self, model_name: str) -> bool:
"""Check if model exists"""
return model_name in self.calibrators
def is_fitted(self, model_name: str) -> bool:
"""Check if model is fitted"""
if model_name not in self.calibrators:
raise ValueError(f"Model '{model_name}' not found")
return self.calibrators[model_name].is_fitted
def remove_calibrator(self, model_name: str) -> None:
"""Remove calibrator for specific model"""
if model_name not in self.calibrators:
raise ValueError(f"Model '{model_name}' not found")
del self.calibrators[model_name]
logger.info("Removed calibrator", model=model_name)