# ai-tax-agent/libs/observability/prometheus.py

"""Prometheus metrics setup and business metrics."""

from typing import Any

from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Info
from prometheus_fastapi_instrumentator import Instrumentator


# Process-wide cache of the fast/slow request counters. They are registered in
# the default Prometheus registry, which rejects a second collector with the
# same name, so each counter must be constructed at most once per process.
_request_speed_counters: dict[str, Any] = {}


def _request_speed_counter(name: str, documentation: str) -> Any:
    """Return the request-speed counter called ``name``, creating it on first use."""
    if name not in _request_speed_counters:
        _request_speed_counters[name] = Counter(
            name, documentation, ["method", "endpoint"]
        )
    return _request_speed_counters[name]


def init_prometheus_metrics(  # pylint: disable=unused-argument
    app: Any, service_name: str
) -> Any:
    """Instrument a FastAPI app with Prometheus metrics and expose ``/metrics``.

    In addition to the instrumentator's own configuration, two counters are
    maintained per request:

    * ``http_requests_fast_total`` - requests that took less than 100 ms
    * ``http_requests_slow_total`` - requests that took more than 1 s

    Both are labelled with the HTTP method and the (templated) handler.

    Args:
        app: The FastAPI application to instrument.
        service_name: Name of the calling service. Currently unused; kept so
            the signature stays stable for callers.

    Returns:
        The configured ``Instrumentator`` instance.
    """

    # Create instrumentator
    instrumentator = Instrumentator(
        should_group_status_codes=False,
        should_ignore_untemplated=True,
        should_respect_env_var=True,
        should_instrument_requests_inprogress=True,
        excluded_handlers=["/metrics", "/healthz", "/readyz", "/livez"],
        env_var_name="ENABLE_METRICS",
        inprogress_name="http_requests_inprogress",
        inprogress_labels=True,
    )

    # Build the counters once, up front. Constructing a Counter inside the
    # per-request callback would try to re-register the same metric name on
    # every request and fail from the second request onwards.
    fast_requests = _request_speed_counter(
        "http_requests_fast_total",
        "Number of fast HTTP requests (< 100ms)",
    )
    slow_requests = _request_speed_counter(
        "http_requests_slow_total",
        "Number of slow HTTP requests (> 1s)",
    )

    def track_request_speed(info: Any) -> None:
        """Count the finished request as fast or slow based on its duration."""
        duration = info.modified_duration
        if duration < 0.1:
            bucket = fast_requests
        elif duration > 1.0:
            bucket = slow_requests
        else:
            # Between 100 ms and 1 s: neither fast nor slow.
            return
        bucket.labels(method=info.method, endpoint=info.modified_handler).inc()

    # Add custom metrics. ``add`` expects callables that receive the request
    # ``Info``; it has no "predicate, action" form, so the threshold check
    # lives inside the single callable registered here.
    # NOTE(review): registering any custom instrumentation may suppress the
    # library's default HTTP metrics - confirm against the installed version.
    instrumentator.add(track_request_speed)

    # Instrument the app
    instrumentator.instrument(app)
    instrumentator.expose(app, endpoint="/metrics")

    return instrumentator


# Module-level cache of BusinessMetrics instances, keyed by service name.
# get_business_metrics() fills it on first request for a service and returns
# the stored instance afterwards, so each service name maps to one instance.
_business_metrics_registry: dict[str, Any] = {}


# Custom metrics for business logic
class BusinessMetrics:  # pylint: disable=too-many-instance-attributes
    """Custom business metrics for the application"""

    def __init__(self, service_name: str):
        self.service_name = service_name
        # Sanitize service name for Prometheus metrics (replace hyphens with underscores)
        self.sanitized_name = service_name.replace("-", "_")

        # Create a custom registry for this service to avoid conflicts
        self.registry = CollectorRegistry()

        # Document processing metrics
        self.documents_processed = Counter(
            "documents_processed_total",
            "Total number of documents processed",
            ["service", "document_type", "status"],
            registry=self.registry,
        )

        # Add active connections metric for tests
        self.active_connections = Gauge(
            "active_connections",
            "Number of active connections",
            ["service"],
            registry=self.registry,
        )

        # Dynamic counters for forms service
        self._dynamic_counters: dict[str, Any] = {}

        self.document_processing_duration = Histogram(
            f"document_processing_duration_seconds_{self.sanitized_name}",
            "Time spent processing documents",
            ["service", "document_type"],
            buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
            registry=self.registry,
        )

        # Field extraction metrics
        self.field_extractions = Counter(
            f"field_extractions_total_{self.sanitized_name}",
            "Total number of field extractions",
            ["service", "field_type", "status"],
            registry=self.registry,
        )

        self.extraction_confidence = Histogram(
            f"extraction_confidence_score_{self.sanitized_name}",
            "Confidence scores for extractions",
            ["service", "extraction_type"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )

        # Tax calculation metrics
        self.tax_calculations = Counter(
            f"tax_calculations_total_{self.sanitized_name}",
            "Total number of tax calculations",
            ["service", "calculation_type", "status"],
            registry=self.registry,
        )

        self.calculation_confidence = Histogram(
            f"calculation_confidence_score_{self.sanitized_name}",
            "Confidence scores for tax calculations",
            ["service", "calculation_type"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )

        # RAG metrics
        self.rag_searches = Counter(
            f"rag_searches_total_{self.sanitized_name}",
            "Total number of RAG searches",
            ["service", "collection", "status"],
            registry=self.registry,
        )

        self.rag_search_duration = Histogram(
            f"rag_search_duration_seconds_{self.sanitized_name}",
            "Time spent on RAG searches",
            ["service", "collection"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
            registry=self.registry,
        )

        self.rag_relevance_score = Histogram(
            f"rag_relevance_score_{self.sanitized_name}",
            "RAG search relevance scores",
            ["service", "collection"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )

        # Knowledge graph metrics
        self.kg_operations = Counter(
            f"kg_operations_total_{self.sanitized_name}",
            "Total number of KG operations",
            ["service", "operation", "status"],
            registry=self.registry,
        )

        self.kg_query_duration = Histogram(
            f"kg_query_duration_seconds_{self.sanitized_name}",
            "Time spent on KG queries",
            ["service", "query_type"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
            registry=self.registry,
        )

        # HMRC submission metrics
        self.hmrc_submissions = Counter(
            f"hmrc_submissions_total_{self.sanitized_name}",
            "Total number of HMRC submissions",
            ["service", "submission_type", "status"],
            registry=self.registry,
        )

        # Service health metrics
        self.service_info = Info(
            f"service_info_{self.sanitized_name}",
            "Service information",
            registry=self.registry,
        )
        try:
            self.service_info.info({"service": service_name, "version": "1.0.0"})
        except (AttributeError, ValueError):
            # Handle prometheus_client version compatibility or registry conflicts
            pass

    def counter(self, name: str, labelnames: list[str] | None = None) -> Any:
        """Get or create a counter metric with dynamic labels"""
        # Use provided labelnames or default ones
        if labelnames is None:
            labelnames = ["tenant_id", "form_id", "scope", "error_type"]

        # Create a unique key based on name and labelnames
        label_key = f"{name}_{','.join(sorted(labelnames))}"

        if label_key not in self._dynamic_counters:
            self._dynamic_counters[label_key] = Counter(
                name,
                f"Dynamic counter: {name}",
                labelnames=labelnames,
                registry=self.registry,
            )
        return self._dynamic_counters[label_key]

    def histogram(self, name: str, labelnames: list[str] | None = None) -> Any:
        """Get or create a histogram metric with dynamic labels"""
        # Use provided labelnames or default ones
        if labelnames is None:
            labelnames = ["tenant_id", "kind"]

        # Create a unique key based on name and labelnames
        label_key = f"{name}_{','.join(sorted(labelnames))}"
        histogram_key = f"_histogram_{label_key}"

        if not hasattr(self, histogram_key):
            histogram = Histogram(
                name,
                f"Dynamic histogram: {name}",
                labelnames=labelnames,
                registry=self.registry,
            )
            setattr(self, histogram_key, histogram)
        return getattr(self, histogram_key)


def get_business_metrics(service_name: str) -> BusinessMetrics:
    """Return the shared BusinessMetrics instance for ``service_name``.

    The instance is built on the first request for a given service name and
    stored in the module-level cache; later calls hand back that same object.
    """
    try:
        return _business_metrics_registry[service_name]  # type: ignore
    except KeyError:
        # First request for this service: fall through and create it.
        pass

    created = BusinessMetrics(service_name)
    _business_metrics_registry[service_name] = created
    return _business_metrics_registry[service_name]  # type: ignore