Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
236 lines
8.4 KiB
Python
236 lines
8.4 KiB
Python
"""Prometheus metrics setup and business metrics."""
|
|
|
|
from typing import Any
|
|
|
|
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Info
|
|
from prometheus_fastapi_instrumentator import Instrumentator
|
|
|
|
|
|
# Latency-classification counters, created at most once per process.
# prometheus_client refuses to register two collectors with the same name in
# one registry, so they must not be rebuilt per request or per app.
_latency_counters: dict[str, Any] = {}


def _get_latency_counter(name: str, documentation: str) -> Any:
    """Return the process-wide counter called *name*, creating it on first use."""
    if name not in _latency_counters:
        _latency_counters[name] = Counter(
            name,
            documentation,
            ["method", "endpoint"],
        )
    return _latency_counters[name]


def init_prometheus_metrics(  # pylint: disable=unused-argument
    app: Any, service_name: str
) -> Any:
    """Initialize Prometheus metrics for a FastAPI app.

    Builds an ``Instrumentator``, registers a hook that counts fast
    (< 100ms) and slow (> 1s) requests, instruments ``app`` and exposes the
    scrape endpoint at ``/metrics``.

    Args:
        app: Application object handed to ``Instrumentator.instrument`` and
            ``Instrumentator.expose``.
        service_name: Currently unused; kept so existing callers keep working.

    Returns:
        The configured ``Instrumentator`` instance.
    """
    # Create instrumentator
    instrumentator = Instrumentator(
        should_group_status_codes=False,
        should_ignore_untemplated=True,
        should_respect_env_var=True,
        should_instrument_requests_inprogress=True,
        excluded_handlers=["/metrics", "/healthz", "/readyz", "/livez"],
        env_var_name="ENABLE_METRICS",
        inprogress_name="http_requests_inprogress",
        inprogress_labels=True,
    )

    fast_requests = _get_latency_counter(
        "http_requests_fast_total",
        "Number of fast HTTP requests (< 100ms)",
    )
    slow_requests = _get_latency_counter(
        "http_requests_slow_total",
        "Number of slow HTTP requests (> 1s)",
    )

    def track_latency_class(info: Any) -> None:
        """Count the finished request as fast or slow based on its duration."""
        # Every callable given to ``add`` is invoked for each request, so the
        # duration check has to live inside the hook itself rather than in a
        # separate "condition" callable.
        if info.modified_duration < 0.1:
            fast_requests.labels(
                method=info.method, endpoint=info.modified_handler
            ).inc()
        elif info.modified_duration > 1.0:
            slow_requests.labels(
                method=info.method, endpoint=info.modified_handler
            ).inc()

    # Add custom metrics
    instrumentator.add(track_latency_class)

    # Instrument the app
    instrumentator.instrument(app)
    instrumentator.expose(app, endpoint="/metrics")

    return instrumentator
|
|
|
|
|
|
# Global registry for business metrics to avoid duplicates
|
|
# Maps service name -> BusinessMetrics instance; filled lazily by
# get_business_metrics() so each service name is only instantiated once.
_business_metrics_registry: dict[str, Any] = {}
|
|
|
|
|
|
# Custom metrics for business logic
|
|
class BusinessMetrics: # pylint: disable=too-many-instance-attributes
|
|
"""Custom business metrics for the application"""
|
|
|
|
def __init__(self, service_name: str):
|
|
self.service_name = service_name
|
|
# Sanitize service name for Prometheus metrics (replace hyphens with underscores)
|
|
self.sanitized_name = service_name.replace("-", "_")
|
|
|
|
# Create a custom registry for this service to avoid conflicts
|
|
self.registry = CollectorRegistry()
|
|
|
|
# Document processing metrics
|
|
self.documents_processed = Counter(
|
|
"documents_processed_total",
|
|
"Total number of documents processed",
|
|
["service", "document_type", "status"],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# Add active connections metric for tests
|
|
self.active_connections = Gauge(
|
|
"active_connections",
|
|
"Number of active connections",
|
|
["service"],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# Dynamic counters for forms service
|
|
self._dynamic_counters: dict[str, Any] = {}
|
|
|
|
self.document_processing_duration = Histogram(
|
|
f"document_processing_duration_seconds_{self.sanitized_name}",
|
|
"Time spent processing documents",
|
|
["service", "document_type"],
|
|
buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# Field extraction metrics
|
|
self.field_extractions = Counter(
|
|
f"field_extractions_total_{self.sanitized_name}",
|
|
"Total number of field extractions",
|
|
["service", "field_type", "status"],
|
|
registry=self.registry,
|
|
)
|
|
|
|
self.extraction_confidence = Histogram(
|
|
f"extraction_confidence_score_{self.sanitized_name}",
|
|
"Confidence scores for extractions",
|
|
["service", "extraction_type"],
|
|
buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# Tax calculation metrics
|
|
self.tax_calculations = Counter(
|
|
f"tax_calculations_total_{self.sanitized_name}",
|
|
"Total number of tax calculations",
|
|
["service", "calculation_type", "status"],
|
|
registry=self.registry,
|
|
)
|
|
|
|
self.calculation_confidence = Histogram(
|
|
f"calculation_confidence_score_{self.sanitized_name}",
|
|
"Confidence scores for tax calculations",
|
|
["service", "calculation_type"],
|
|
buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# RAG metrics
|
|
self.rag_searches = Counter(
|
|
f"rag_searches_total_{self.sanitized_name}",
|
|
"Total number of RAG searches",
|
|
["service", "collection", "status"],
|
|
registry=self.registry,
|
|
)
|
|
|
|
self.rag_search_duration = Histogram(
|
|
f"rag_search_duration_seconds_{self.sanitized_name}",
|
|
"Time spent on RAG searches",
|
|
["service", "collection"],
|
|
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
|
|
registry=self.registry,
|
|
)
|
|
|
|
self.rag_relevance_score = Histogram(
|
|
f"rag_relevance_score_{self.sanitized_name}",
|
|
"RAG search relevance scores",
|
|
["service", "collection"],
|
|
buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# Knowledge graph metrics
|
|
self.kg_operations = Counter(
|
|
f"kg_operations_total_{self.sanitized_name}",
|
|
"Total number of KG operations",
|
|
["service", "operation", "status"],
|
|
registry=self.registry,
|
|
)
|
|
|
|
self.kg_query_duration = Histogram(
|
|
f"kg_query_duration_seconds_{self.sanitized_name}",
|
|
"Time spent on KG queries",
|
|
["service", "query_type"],
|
|
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# HMRC submission metrics
|
|
self.hmrc_submissions = Counter(
|
|
f"hmrc_submissions_total_{self.sanitized_name}",
|
|
"Total number of HMRC submissions",
|
|
["service", "submission_type", "status"],
|
|
registry=self.registry,
|
|
)
|
|
|
|
# Service health metrics
|
|
self.service_info = Info(
|
|
f"service_info_{self.sanitized_name}",
|
|
"Service information",
|
|
registry=self.registry,
|
|
)
|
|
try:
|
|
self.service_info.info({"service": service_name, "version": "1.0.0"})
|
|
except (AttributeError, ValueError):
|
|
# Handle prometheus_client version compatibility or registry conflicts
|
|
pass
|
|
|
|
def counter(self, name: str, labelnames: list[str] | None = None) -> Any:
|
|
"""Get or create a counter metric with dynamic labels"""
|
|
# Use provided labelnames or default ones
|
|
if labelnames is None:
|
|
labelnames = ["tenant_id", "form_id", "scope", "error_type"]
|
|
|
|
# Create a unique key based on name and labelnames
|
|
label_key = f"{name}_{','.join(sorted(labelnames))}"
|
|
|
|
if label_key not in self._dynamic_counters:
|
|
self._dynamic_counters[label_key] = Counter(
|
|
name,
|
|
f"Dynamic counter: {name}",
|
|
labelnames=labelnames,
|
|
registry=self.registry,
|
|
)
|
|
return self._dynamic_counters[label_key]
|
|
|
|
def histogram(self, name: str, labelnames: list[str] | None = None) -> Any:
|
|
"""Get or create a histogram metric with dynamic labels"""
|
|
# Use provided labelnames or default ones
|
|
if labelnames is None:
|
|
labelnames = ["tenant_id", "kind"]
|
|
|
|
# Create a unique key based on name and labelnames
|
|
label_key = f"{name}_{','.join(sorted(labelnames))}"
|
|
histogram_key = f"_histogram_{label_key}"
|
|
|
|
if not hasattr(self, histogram_key):
|
|
histogram = Histogram(
|
|
name,
|
|
f"Dynamic histogram: {name}",
|
|
labelnames=labelnames,
|
|
registry=self.registry,
|
|
)
|
|
setattr(self, histogram_key, histogram)
|
|
return getattr(self, histogram_key)
|
|
|
|
|
|
def get_business_metrics(service_name: str) -> BusinessMetrics:
    """Return the shared BusinessMetrics instance for ``service_name``.

    The instance is built on the first request for a given name and stored in
    the module-level cache; later calls with the same name get the same object.
    """
    metrics = _business_metrics_registry.get(service_name)
    if metrics is None:
        # First request for this service: build once and remember it.
        metrics = BusinessMetrics(service_name)
        _business_metrics_registry[service_name] = metrics
    return metrics  # type: ignore
|