Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
libs/calibration/__init__.py | 12 lines | Normal file
@@ -0,0 +1,12 @@
"""Confidence calibration for ML models."""

from .calibrator import ConfidenceCalibrator
from .metrics import DEFAULT_CALIBRATORS, ConfidenceMetrics
from .multi_model import MultiModelCalibrator

__all__ = [
    "ConfidenceCalibrator",
    "MultiModelCalibrator",
    "ConfidenceMetrics",
    "DEFAULT_CALIBRATORS",
]
libs/calibration/calibrator.py | 190 lines | Normal file
@@ -0,0 +1,190 @@
"""Confidence calibrator using various methods."""

import pickle

import numpy as np
import structlog
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

logger = structlog.get_logger()


class ConfidenceCalibrator:
    """Calibrate confidence scores using various methods"""

    def __init__(self, method: str = "temperature"):
        """
        Initialize calibrator

        Args:
            method: Calibration method ('temperature', 'platt', 'isotonic')
        """
        self.method = method
        self.calibrator = None
        self.temperature = 1.0
        self.is_fitted = False

    def fit(self, scores: list[float], labels: list[bool]) -> None:
        """
        Fit calibration model

        Args:
            scores: Raw confidence scores (0-1)
            labels: True labels (True/False for correct/incorrect)
        """
        # Validate inputs
        if len(scores) == 0 or len(labels) == 0:
            raise ValueError("Scores and labels cannot be empty")

        if len(scores) != len(labels):
            raise ValueError("Scores and labels must have the same length")

        scores_array = np.array(scores).reshape(-1, 1)
        labels_array = np.array(labels, dtype=int)

        if self.method == "temperature":
            self._fit_temperature_scaling(scores_array, labels_array)
        elif self.method == "platt":
            self._fit_platt_scaling(scores_array, labels_array)
        elif self.method == "isotonic":
            self._fit_isotonic_regression(scores_array, labels_array)
        else:
            raise ValueError(f"Unknown calibration method: {self.method}")

        self.is_fitted = True
        logger.info("Calibrator fitted", method=self.method)

    def _fit_temperature_scaling(self, scores: np.ndarray, labels: np.ndarray) -> None:
        """Fit temperature scaling parameter"""
        # pylint: disable=import-outside-toplevel
        from scipy.optimize import minimize_scalar

        def negative_log_likelihood(temperature: float) -> float:
            # Convert scores to logits
            epsilon = 1e-7
            scores_clipped = np.clip(scores.flatten(), epsilon, 1 - epsilon)
            logits = np.log(scores_clipped / (1 - scores_clipped))

            # Apply temperature scaling
            calibrated_logits = logits / temperature
            calibrated_probs = 1 / (1 + np.exp(-calibrated_logits))

            # Calculate negative log likelihood
            nll = -np.mean(
                labels * np.log(calibrated_probs + epsilon)
                + (1 - labels) * np.log(1 - calibrated_probs + epsilon)
            )
            return float(nll)

        # Find optimal temperature
        result = minimize_scalar(  # type: ignore
            negative_log_likelihood,
            bounds=(0.1, 10.0),
            method="bounded",  # fmt: skip # pyright: ignore[reportArgumentType]
        )
        self.temperature = result.x

        logger.debug("Temperature scaling fitted", temperature=self.temperature)

    def _fit_platt_scaling(self, scores: np.ndarray, labels: np.ndarray) -> None:
        """Fit Platt scaling (logistic regression)"""
        # Convert scores to logits
        epsilon = 1e-7
        scores_clipped = np.clip(scores.flatten(), epsilon, 1 - epsilon)
        logits = np.log(scores_clipped / (1 - scores_clipped)).reshape(-1, 1)

        # Fit logistic regression
        self.calibrator = LogisticRegression()
        self.calibrator.fit(logits, labels)  # type: ignore

        logger.debug("Platt scaling fitted")

    def _fit_isotonic_regression(self, scores: np.ndarray, labels: np.ndarray) -> None:
        """Fit isotonic regression"""
        self.calibrator = IsotonicRegression(out_of_bounds="clip")
        self.calibrator.fit(scores.flatten(), labels)  # type: ignore

        logger.debug("Isotonic regression fitted")

    def calibrate(self, scores: list[float]) -> list[float]:
        """
        Calibrate confidence scores

        Args:
            scores: Raw confidence scores

        Returns:
            Calibrated confidence scores
        """
        if not self.is_fitted:
            logger.warning("Calibrator not fitted, returning original scores")
            return scores

        scores_array = np.array(scores)

        if self.method == "temperature":
            return self._calibrate_temperature(scores_array)
        if self.method == "platt":
            return self._calibrate_platt(scores_array)
        if self.method == "isotonic":
            return self._calibrate_isotonic(scores_array)
        return scores

    def _calibrate_temperature(self, scores: np.ndarray) -> list[float]:
        """Apply temperature scaling"""
        epsilon = 1e-7
        scores_clipped = np.clip(scores, epsilon, 1 - epsilon)

        # Convert to logits
        logits = np.log(scores_clipped / (1 - scores_clipped))

        # Apply temperature scaling
        calibrated_logits = logits / self.temperature
        calibrated_probs = 1 / (1 + np.exp(-calibrated_logits))

        return calibrated_probs.tolist()  # type: ignore

    def _calibrate_platt(self, scores: np.ndarray) -> list[float]:
        """Apply Platt scaling"""
        epsilon = 1e-7
        scores_clipped = np.clip(scores, epsilon, 1 - epsilon)

        # Convert to logits
        logits = np.log(scores_clipped / (1 - scores_clipped)).reshape(-1, 1)

        # Apply Platt scaling
        calibrated_probs = self.calibrator.predict_proba(logits)[:, 1]  # type: ignore

        return calibrated_probs.tolist()  # type: ignore

    def _calibrate_isotonic(self, scores: np.ndarray) -> list[float]:
        """Apply isotonic regression"""
        calibrated_probs = self.calibrator.predict(scores)  # type: ignore
        return calibrated_probs.tolist()  # type: ignore

    def save_model(self, filepath: str) -> None:
        """Save calibration model"""
        model_data = {
            "method": self.method,
            "temperature": self.temperature,
            "calibrator": self.calibrator,
            "is_fitted": self.is_fitted,
        }

        with open(filepath, "wb") as f:
            pickle.dump(model_data, f)

        logger.info("Calibration model saved", filepath=filepath)

    def load_model(self, filepath: str) -> None:
        """Load calibration model"""
        with open(filepath, "rb") as f:
            model_data = pickle.load(f)

        self.method = model_data["method"]
        self.temperature = model_data["temperature"]
        self.calibrator = model_data["calibrator"]
        self.is_fitted = model_data["is_fitted"]

        logger.info("Calibration model loaded", filepath=filepath, method=self.method)
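A minimal usage sketch of the ConfidenceCalibrator added above (not part of the diff), assuming libs/ is on the import path; the scores, labels, and file path are illustrative only:

    from libs.calibration import ConfidenceCalibrator

    # Raw model confidences and whether each prediction turned out to be correct
    raw_scores = [0.95, 0.80, 0.65, 0.90, 0.55, 0.70, 0.85, 0.60]
    correct = [True, True, False, True, False, True, False, False]

    calibrator = ConfidenceCalibrator(method="temperature")
    calibrator.fit(raw_scores, correct)  # learns a single temperature for the logits
    print(calibrator.calibrate([0.9, 0.5]))  # calibrated probabilities as a list

    # Persist and later restore the fitted calibrator (pickle-based)
    calibrator.save_model("/tmp/ocr_calibrator.pkl")

Temperature scaling divides each score's logit by the learned temperature, so a temperature above 1 pulls overconfident scores toward 0.5 while preserving their ranking.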
libs/calibration/metrics.py | 144 lines | Normal file
@@ -0,0 +1,144 @@
"""Calibration metrics for evaluating confidence calibration."""

import numpy as np


class ConfidenceMetrics:
    """Calculate calibration metrics"""

    @staticmethod
    def expected_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """
        Calculate Expected Calibration Error (ECE)

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins for calibration

        Returns:
            ECE value
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        ece = 0

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            prop_in_bin = in_bin.mean()

            if prop_in_bin > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                # Add to ECE
                ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin

        return ece

    @staticmethod
    def maximum_calibration_error(
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> float:
        """
        Calculate Maximum Calibration Error (MCE)

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins for calibration

        Returns:
            MCE value
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        max_error = 0

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)

            if in_bin.sum() > 0:
                # Calculate accuracy and confidence in this bin
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                # Update maximum error
                error = np.abs(avg_confidence_in_bin - accuracy_in_bin)
                max_error = max(max_error, error)

        return max_error

    @staticmethod
    def reliability_diagram_data(  # pylint: disable=too-many-locals
        scores: list[float], labels: list[bool], n_bins: int = 10
    ) -> dict[str, list[float]]:
        """
        Generate data for reliability diagram

        Args:
            scores: Predicted confidence scores
            labels: True labels
            n_bins: Number of bins

        Returns:
            Dictionary with bin data for plotting
        """
        scores_array = np.array(scores)
        labels_array = np.array(labels, dtype=int)

        bin_boundaries = np.linspace(0, 1, n_bins + 1)
        bin_lowers = bin_boundaries[:-1]
        bin_uppers = bin_boundaries[1:]

        bin_centers = []
        bin_accuracies = []
        bin_confidences = []
        bin_counts = []

        for bin_lower, bin_upper in zip(bin_lowers, bin_uppers, strict=False):
            # Find samples in this bin
            in_bin = (scores_array > bin_lower) & (scores_array <= bin_upper)
            bin_count = in_bin.sum()

            if bin_count > 0:
                bin_center = (bin_lower + bin_upper) / 2
                accuracy_in_bin = labels_array[in_bin].mean()
                avg_confidence_in_bin = scores_array[in_bin].mean()

                bin_centers.append(bin_center)
                bin_accuracies.append(accuracy_in_bin)
                bin_confidences.append(avg_confidence_in_bin)
                bin_counts.append(bin_count)

        return {
            "bin_centers": bin_centers,
            "bin_accuracies": bin_accuracies,
            "bin_confidences": bin_confidences,
            "bin_counts": bin_counts,
        }


# Default calibrators for common tasks
DEFAULT_CALIBRATORS = {
    "ocr_confidence": {"method": "temperature"},
    "extraction_confidence": {"method": "platt"},
    "rag_confidence": {"method": "isotonic"},
    "calculation_confidence": {"method": "temperature"},
    "overall_confidence": {"method": "platt"},
}
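A short sketch of evaluating calibration quality with the metrics above (not part of the diff), again assuming libs/ is importable and using made-up numbers:

    from libs.calibration import ConfidenceMetrics

    scores = [0.9, 0.8, 0.75, 0.6, 0.55, 0.4, 0.3, 0.2]
    labels = [True, True, False, True, False, False, False, True]

    ece = ConfidenceMetrics.expected_calibration_error(scores, labels, n_bins=5)
    mce = ConfidenceMetrics.maximum_calibration_error(scores, labels, n_bins=5)
    data = ConfidenceMetrics.reliability_diagram_data(scores, labels, n_bins=5)

    # ECE is the bin-weighted mean of |avg confidence - accuracy|; MCE is the worst single bin
    print(f"ECE={ece:.3f}, MCE={mce:.3f}", data["bin_counts"])

Lower is better for both metrics; a perfectly calibrated model has confidence equal to accuracy in every bin, giving ECE and MCE of 0.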
libs/calibration/multi_model.py | 85 lines | Normal file
@@ -0,0 +1,85 @@
"""Multi-model calibrator for handling multiple models/tasks."""

import glob
import os

import structlog

from .calibrator import ConfidenceCalibrator

logger = structlog.get_logger()


class MultiModelCalibrator:
    """Calibrate confidence scores for multiple models/tasks"""

    def __init__(self) -> None:
        self.calibrators: dict[str, ConfidenceCalibrator] = {}

    def add_calibrator(self, model_name: str, method: str = "temperature") -> None:
        """Add calibrator for a specific model"""
        self.calibrators[model_name] = ConfidenceCalibrator(method)
        logger.info("Added calibrator", model=model_name, method=method)

    def fit(self, model_name: str, scores: list[float], labels: list[bool]) -> None:
        """Fit calibrator for specific model"""
        if model_name not in self.calibrators:
            self.add_calibrator(model_name)

        self.calibrators[model_name].fit(scores, labels)

    def calibrate(self, model_name: str, scores: list[float]) -> list[float]:
        """Calibrate scores for specific model"""
        if model_name not in self.calibrators:
            logger.warning("No calibrator for model", model=model_name)
            return scores

        return self.calibrators[model_name].calibrate(scores)

    def save_all(self, directory: str) -> None:
        """Save all calibrators"""
        os.makedirs(directory, exist_ok=True)

        for model_name, calibrator in self.calibrators.items():
            filepath = os.path.join(directory, f"{model_name}_calibrator.pkl")
            calibrator.save_model(filepath)

    def load_all(self, directory: str) -> None:
        """Load all calibrators from directory"""
        pattern = os.path.join(directory, "*_calibrator.pkl")
        for filepath in glob.glob(pattern):
            filename = os.path.basename(filepath)
            model_name = filename.replace("_calibrator.pkl", "")

            calibrator = ConfidenceCalibrator()
            calibrator.load_model(filepath)
            self.calibrators[model_name] = calibrator

    def save_models(self, directory: str) -> None:
        """Save all calibrators (alias for save_all)"""
        self.save_all(directory)

    def load_models(self, directory: str) -> None:
        """Load all calibrators from directory (alias for load_all)"""
        self.load_all(directory)

    def get_model_names(self) -> list[str]:
        """Get list of model names"""
        return list(self.calibrators.keys())

    def has_model(self, model_name: str) -> bool:
        """Check if model exists"""
        return model_name in self.calibrators

    def is_fitted(self, model_name: str) -> bool:
        """Check if model is fitted"""
        if model_name not in self.calibrators:
            raise ValueError(f"Model '{model_name}' not found")
        return self.calibrators[model_name].is_fitted

    def remove_calibrator(self, model_name: str) -> None:
        """Remove calibrator for specific model"""
        if model_name not in self.calibrators:
            raise ValueError(f"Model '{model_name}' not found")
        del self.calibrators[model_name]
        logger.info("Removed calibrator", model=model_name)
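An end-to-end sketch combining MultiModelCalibrator with the DEFAULT_CALIBRATORS mapping from metrics.py (not part of the diff); the directory, scores, and labels are hypothetical:

    from libs.calibration import DEFAULT_CALIBRATORS, MultiModelCalibrator

    multi = MultiModelCalibrator()
    for task, cfg in DEFAULT_CALIBRATORS.items():
        multi.add_calibrator(task, method=cfg["method"])

    # Fit one task from (hypothetical) validation data, then persist every calibrator
    multi.fit("ocr_confidence", [0.9, 0.7, 0.6, 0.8], [True, False, True, True])
    multi.save_all("/tmp/calibrators")

    restored = MultiModelCalibrator()
    restored.load_all("/tmp/calibrators")  # loads every *_calibrator.pkl in the directory
    print(restored.get_model_names(), restored.is_fitted("ocr_confidence"))

Note that save_all writes every registered calibrator whether or not it has been fitted, and load_all infers the task name from each file name.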