Files
ai-tax-agent/libs/observability/prometheus.py
harkon b324ff09ef
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Initial commit
2025-10-11 08:41:36 +01:00

236 lines
8.4 KiB
Python

"""Prometheus metrics setup and business metrics."""
from typing import Any
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Info
from prometheus_fastapi_instrumentator import Instrumentator
# Request-speed counters, created at most once per process. A registry accepts
# only one collector per metric name, so these must not be rebuilt per request
# or per call to init_prometheus_metrics.
_HTTP_SPEED_COUNTER_SPECS: dict[str, tuple[str, str]] = {
    "fast": ("http_requests_fast_total", "Number of fast HTTP requests (< 100ms)"),
    "slow": ("http_requests_slow_total", "Number of slow HTTP requests (> 1s)"),
}
_HTTP_SPEED_COUNTERS: dict[str, Any] = {}


def _http_speed_counter(kind: str) -> Any:
    """Return the cached "fast" or "slow" request counter, creating it on first use."""
    counter = _HTTP_SPEED_COUNTERS.get(kind)
    if counter is None:
        name, documentation = _HTTP_SPEED_COUNTER_SPECS[kind]
        counter = Counter(name, documentation, ["method", "endpoint"])
        _HTTP_SPEED_COUNTERS[kind] = counter
    return counter


def init_prometheus_metrics(  # pylint: disable=unused-argument
    app: Any, service_name: str
) -> Any:
    """Initialize Prometheus metrics for a FastAPI app.

    Instruments ``app``, adds counters for fast (< 100ms) and slow (> 1s)
    requests, and exposes the metrics at ``/metrics``.

    Args:
        app: The application object to instrument.
        service_name: Name of the calling service (currently unused).

    Returns:
        The configured ``Instrumentator`` instance.
    """
    instrumentator = Instrumentator(
        should_group_status_codes=False,
        should_ignore_untemplated=True,
        should_respect_env_var=True,
        should_instrument_requests_inprogress=True,
        excluded_handlers=["/metrics", "/healthz", "/readyz", "/livez"],
        env_var_name="ENABLE_METRICS",
        inprogress_name="http_requests_inprogress",
        inprogress_labels=True,
    )

    fast_requests = _http_speed_counter("fast")
    slow_requests = _http_speed_counter("slow")

    def track_request_speed(info: Any) -> None:
        """Count the request as fast or slow based on its measured duration."""
        duration = info.modified_duration
        if duration < 0.1:
            bucket = fast_requests
        elif duration > 1.0:
            bucket = slow_requests
        else:
            # Requests between the two thresholds are not counted by either metric.
            return
        bucket.labels(method=info.method, endpoint=info.modified_handler).inc()

    # A single instrumentation hook: the threshold check and the increment
    # live together so the condition is actually applied.
    instrumentator.add(track_request_speed)

    # Instrument the app
    instrumentator.instrument(app)
    instrumentator.expose(app, endpoint="/metrics")
    return instrumentator
# Module-level cache of BusinessMetrics instances, keyed by service name, so
# repeated lookups for the same service reuse one instance instead of
# building a duplicate set of metrics.
_business_metrics_registry: dict[str, Any] = {}
# Custom metrics for business logic
class BusinessMetrics:  # pylint: disable=too-many-instance-attributes
    """Custom business metrics for the application.

    Every metric is registered in a ``CollectorRegistry`` owned by the
    instance rather than in the default global registry. Besides the fixed
    metrics created in ``__init__``, counters and histograms can be created on
    demand through ``counter()`` and ``histogram()``.
    """

    def __init__(self, service_name: str) -> None:
        """Create the fixed set of business metrics for one service.

        Args:
            service_name: Service name. Hyphens are replaced with underscores
                before it is used as a metric-name suffix.
        """
        self.service_name = service_name
        # Sanitize service name for Prometheus metrics (replace hyphens with underscores)
        self.sanitized_name = service_name.replace("-", "_")

        # Create a custom registry for this service to avoid conflicts
        self.registry = CollectorRegistry()

        # Metrics created on demand, keyed by metric name. Each value is a
        # (sorted label names, metric) pair so a later request for the same
        # name can be checked against the labels it was created with.
        self._dynamic_counters: dict[str, Any] = {}
        self._dynamic_histograms: dict[str, Any] = {}

        # Document processing metrics
        self.documents_processed = Counter(
            "documents_processed_total",
            "Total number of documents processed",
            ["service", "document_type", "status"],
            registry=self.registry,
        )

        # Add active connections metric for tests
        self.active_connections = Gauge(
            "active_connections",
            "Number of active connections",
            ["service"],
            registry=self.registry,
        )

        self.document_processing_duration = Histogram(
            f"document_processing_duration_seconds_{self.sanitized_name}",
            "Time spent processing documents",
            ["service", "document_type"],
            buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
            registry=self.registry,
        )

        # Field extraction metrics
        self.field_extractions = Counter(
            f"field_extractions_total_{self.sanitized_name}",
            "Total number of field extractions",
            ["service", "field_type", "status"],
            registry=self.registry,
        )

        self.extraction_confidence = Histogram(
            f"extraction_confidence_score_{self.sanitized_name}",
            "Confidence scores for extractions",
            ["service", "extraction_type"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )

        # Tax calculation metrics
        self.tax_calculations = Counter(
            f"tax_calculations_total_{self.sanitized_name}",
            "Total number of tax calculations",
            ["service", "calculation_type", "status"],
            registry=self.registry,
        )

        self.calculation_confidence = Histogram(
            f"calculation_confidence_score_{self.sanitized_name}",
            "Confidence scores for tax calculations",
            ["service", "calculation_type"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )

        # RAG metrics
        self.rag_searches = Counter(
            f"rag_searches_total_{self.sanitized_name}",
            "Total number of RAG searches",
            ["service", "collection", "status"],
            registry=self.registry,
        )

        self.rag_search_duration = Histogram(
            f"rag_search_duration_seconds_{self.sanitized_name}",
            "Time spent on RAG searches",
            ["service", "collection"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
            registry=self.registry,
        )

        self.rag_relevance_score = Histogram(
            f"rag_relevance_score_{self.sanitized_name}",
            "RAG search relevance scores",
            ["service", "collection"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )

        # Knowledge graph metrics
        self.kg_operations = Counter(
            f"kg_operations_total_{self.sanitized_name}",
            "Total number of KG operations",
            ["service", "operation", "status"],
            registry=self.registry,
        )

        self.kg_query_duration = Histogram(
            f"kg_query_duration_seconds_{self.sanitized_name}",
            "Time spent on KG queries",
            ["service", "query_type"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
            registry=self.registry,
        )

        # HMRC submission metrics
        self.hmrc_submissions = Counter(
            f"hmrc_submissions_total_{self.sanitized_name}",
            "Total number of HMRC submissions",
            ["service", "submission_type", "status"],
            registry=self.registry,
        )

        # Service health metrics
        self.service_info = Info(
            f"service_info_{self.sanitized_name}",
            "Service information",
            registry=self.registry,
        )

        try:
            self.service_info.info({"service": service_name, "version": "1.0.0"})
        except (AttributeError, ValueError):
            # Deliberately best-effort: handle prometheus_client version
            # compatibility or registry conflicts without failing construction.
            pass

    def _dynamic_metric(
        self,
        cache: dict[str, Any],
        metric_cls: Any,
        kind: str,
        name: str,
        labelnames: list[str],
    ) -> Any:
        """Return the cached metric called ``name``, creating it if needed.

        The cache is keyed by metric name alone because the registry cannot
        hold two collectors with the same name; label names are compared
        order-insensitively against those used at creation time.
        """
        requested = tuple(sorted(labelnames))
        entry = cache.get(name)
        if entry is None:
            metric = metric_cls(
                name,
                f"Dynamic {kind}: {name}",
                labelnames=labelnames,
                registry=self.registry,
            )
            cache[name] = (requested, metric)
            return metric

        known, metric = entry
        if known != requested:
            raise ValueError(
                f"{kind} {name!r} already exists with label names {list(known)}; "
                f"requested {list(requested)}"
            )
        return metric

    def counter(self, name: str, labelnames: list[str] | None = None) -> Any:
        """Get or create a counter metric with dynamic labels.

        Args:
            name: Metric name.
            labelnames: Label names for the counter. Defaults to
                ``["tenant_id", "form_id", "scope", "error_type"]``.

        Returns:
            The counter registered under ``name`` in this instance's registry.

        Raises:
            ValueError: If ``name`` was already created with different label
                names.
        """
        # Use provided labelnames or default ones
        if labelnames is None:
            labelnames = ["tenant_id", "form_id", "scope", "error_type"]
        return self._dynamic_metric(
            self._dynamic_counters, Counter, "counter", name, labelnames
        )

    def histogram(self, name: str, labelnames: list[str] | None = None) -> Any:
        """Get or create a histogram metric with dynamic labels.

        Args:
            name: Metric name.
            labelnames: Label names for the histogram. Defaults to
                ``["tenant_id", "kind"]``.

        Returns:
            The histogram registered under ``name`` in this instance's
            registry.

        Raises:
            ValueError: If ``name`` was already created with different label
                names.
        """
        # Use provided labelnames or default ones
        if labelnames is None:
            labelnames = ["tenant_id", "kind"]
        return self._dynamic_metric(
            self._dynamic_histograms, Histogram, "histogram", name, labelnames
        )
def get_business_metrics(service_name: str) -> BusinessMetrics:
    """Return the shared BusinessMetrics instance for ``service_name``.

    The instance is built on the first request for a given service name and
    stored in the module-level cache; later calls return that same object.
    """
    metrics: BusinessMetrics | None = _business_metrics_registry.get(service_name)
    if metrics is None:
        # First lookup for this service: build once and remember it so the
        # same metrics are never constructed twice.
        metrics = BusinessMetrics(service_name)
        _business_metrics_registry[service_name] = metrics
    return metrics