Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
235
libs/observability/prometheus.py
Normal file
235
libs/observability/prometheus.py
Normal file
@@ -0,0 +1,235 @@
|
||||
"""Prometheus metrics setup and business metrics."""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Info
|
||||
from prometheus_fastapi_instrumentator import Instrumentator
|
||||
|
||||
|
||||
# Process-wide cache of the latency counters used by
# ``init_prometheus_metrics``. prometheus_client refuses to register a second
# collector with the same name in one registry, so each counter must be built
# exactly once and then reused.
_latency_counters: dict[str, Any] = {}


def _get_latency_counter(name: str, documentation: str) -> Any:
    """Return the cached latency counter ``name``, creating it on first use."""
    if name not in _latency_counters:
        _latency_counters[name] = Counter(
            name,
            documentation,
            ["method", "endpoint"],
        )
    return _latency_counters[name]


def init_prometheus_metrics(  # pylint: disable=unused-argument
    app: Any, service_name: str
) -> Any:
    """Initialize Prometheus metrics for FastAPI app.

    Builds an ``Instrumentator``, registers two latency counters
    (``http_requests_fast_total`` for requests under 100 ms and
    ``http_requests_slow_total`` for requests over 1 s), instruments ``app``
    and exposes the scrape endpoint at ``/metrics``.

    Args:
        app: Application object handed to ``Instrumentator.instrument`` and
            ``Instrumentator.expose``.
        service_name: Currently unused; kept so existing callers keep working.

    Returns:
        The configured ``Instrumentator`` instance.
    """
    # Create instrumentator. ``should_respect_env_var=True`` together with
    # ``env_var_name`` ties activation to the ENABLE_METRICS environment
    # variable (see the library documentation for the accepted values).
    instrumentator = Instrumentator(
        should_group_status_codes=False,
        should_ignore_untemplated=True,
        should_respect_env_var=True,
        should_instrument_requests_inprogress=True,
        excluded_handlers=["/metrics", "/healthz", "/readyz", "/livez"],
        env_var_name="ENABLE_METRICS",
        inprogress_name="http_requests_inprogress",
        inprogress_labels=True,
    )

    # Build the counters once, up front. Creating them inside the per-request
    # callback would attempt to re-register the same metric on every request.
    fast_requests = _get_latency_counter(
        "http_requests_fast_total",
        "Number of fast HTTP requests (< 100ms)",
    )
    slow_requests = _get_latency_counter(
        "http_requests_slow_total",
        "Number of slow HTTP requests (> 1s)",
    )

    # ``Instrumentator.add`` takes instrumentation functions that are each
    # called with the request ``Info``; it has no separate predicate argument,
    # so the latency threshold has to be checked inside the function itself.
    def count_fast_request(info: Any) -> None:
        """Increment the fast-request counter for requests under 100 ms."""
        if info.modified_duration < 0.1:
            fast_requests.labels(
                method=info.method, endpoint=info.modified_handler
            ).inc()

    def count_slow_request(info: Any) -> None:
        """Increment the slow-request counter for requests over 1 s."""
        if info.modified_duration > 1.0:
            slow_requests.labels(
                method=info.method, endpoint=info.modified_handler
            ).inc()

    # NOTE(review): the library appears to install its default HTTP metrics
    # only when no custom instrumentation was added - confirm whether
    # ``metrics.default()`` should be added explicitly here as well.
    instrumentator.add(count_fast_request)
    instrumentator.add(count_slow_request)

    # Instrument the app
    instrumentator.instrument(app)
    instrumentator.expose(app, endpoint="/metrics")

    return instrumentator
|
||||
|
||||
|
||||
# Global registry for business metrics to avoid duplicates
|
||||
_business_metrics_registry: dict[str, Any] = {}
|
||||
|
||||
|
||||
# Custom metrics for business logic
|
||||
class BusinessMetrics: # pylint: disable=too-many-instance-attributes
|
||||
"""Custom business metrics for the application"""
|
||||
|
||||
def __init__(self, service_name: str):
|
||||
self.service_name = service_name
|
||||
# Sanitize service name for Prometheus metrics (replace hyphens with underscores)
|
||||
self.sanitized_name = service_name.replace("-", "_")
|
||||
|
||||
# Create a custom registry for this service to avoid conflicts
|
||||
self.registry = CollectorRegistry()
|
||||
|
||||
# Document processing metrics
|
||||
self.documents_processed = Counter(
|
||||
"documents_processed_total",
|
||||
"Total number of documents processed",
|
||||
["service", "document_type", "status"],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# Add active connections metric for tests
|
||||
self.active_connections = Gauge(
|
||||
"active_connections",
|
||||
"Number of active connections",
|
||||
["service"],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# Dynamic counters for forms service
|
||||
self._dynamic_counters: dict[str, Any] = {}
|
||||
|
||||
self.document_processing_duration = Histogram(
|
||||
f"document_processing_duration_seconds_{self.sanitized_name}",
|
||||
"Time spent processing documents",
|
||||
["service", "document_type"],
|
||||
buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# Field extraction metrics
|
||||
self.field_extractions = Counter(
|
||||
f"field_extractions_total_{self.sanitized_name}",
|
||||
"Total number of field extractions",
|
||||
["service", "field_type", "status"],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
self.extraction_confidence = Histogram(
|
||||
f"extraction_confidence_score_{self.sanitized_name}",
|
||||
"Confidence scores for extractions",
|
||||
["service", "extraction_type"],
|
||||
buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# Tax calculation metrics
|
||||
self.tax_calculations = Counter(
|
||||
f"tax_calculations_total_{self.sanitized_name}",
|
||||
"Total number of tax calculations",
|
||||
["service", "calculation_type", "status"],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
self.calculation_confidence = Histogram(
|
||||
f"calculation_confidence_score_{self.sanitized_name}",
|
||||
"Confidence scores for tax calculations",
|
||||
["service", "calculation_type"],
|
||||
buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# RAG metrics
|
||||
self.rag_searches = Counter(
|
||||
f"rag_searches_total_{self.sanitized_name}",
|
||||
"Total number of RAG searches",
|
||||
["service", "collection", "status"],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
self.rag_search_duration = Histogram(
|
||||
f"rag_search_duration_seconds_{self.sanitized_name}",
|
||||
"Time spent on RAG searches",
|
||||
["service", "collection"],
|
||||
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
self.rag_relevance_score = Histogram(
|
||||
f"rag_relevance_score_{self.sanitized_name}",
|
||||
"RAG search relevance scores",
|
||||
["service", "collection"],
|
||||
buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# Knowledge graph metrics
|
||||
self.kg_operations = Counter(
|
||||
f"kg_operations_total_{self.sanitized_name}",
|
||||
"Total number of KG operations",
|
||||
["service", "operation", "status"],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
self.kg_query_duration = Histogram(
|
||||
f"kg_query_duration_seconds_{self.sanitized_name}",
|
||||
"Time spent on KG queries",
|
||||
["service", "query_type"],
|
||||
buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# HMRC submission metrics
|
||||
self.hmrc_submissions = Counter(
|
||||
f"hmrc_submissions_total_{self.sanitized_name}",
|
||||
"Total number of HMRC submissions",
|
||||
["service", "submission_type", "status"],
|
||||
registry=self.registry,
|
||||
)
|
||||
|
||||
# Service health metrics
|
||||
self.service_info = Info(
|
||||
f"service_info_{self.sanitized_name}",
|
||||
"Service information",
|
||||
registry=self.registry,
|
||||
)
|
||||
try:
|
||||
self.service_info.info({"service": service_name, "version": "1.0.0"})
|
||||
except (AttributeError, ValueError):
|
||||
# Handle prometheus_client version compatibility or registry conflicts
|
||||
pass
|
||||
|
||||
def counter(self, name: str, labelnames: list[str] | None = None) -> Any:
|
||||
"""Get or create a counter metric with dynamic labels"""
|
||||
# Use provided labelnames or default ones
|
||||
if labelnames is None:
|
||||
labelnames = ["tenant_id", "form_id", "scope", "error_type"]
|
||||
|
||||
# Create a unique key based on name and labelnames
|
||||
label_key = f"{name}_{','.join(sorted(labelnames))}"
|
||||
|
||||
if label_key not in self._dynamic_counters:
|
||||
self._dynamic_counters[label_key] = Counter(
|
||||
name,
|
||||
f"Dynamic counter: {name}",
|
||||
labelnames=labelnames,
|
||||
registry=self.registry,
|
||||
)
|
||||
return self._dynamic_counters[label_key]
|
||||
|
||||
def histogram(self, name: str, labelnames: list[str] | None = None) -> Any:
|
||||
"""Get or create a histogram metric with dynamic labels"""
|
||||
# Use provided labelnames or default ones
|
||||
if labelnames is None:
|
||||
labelnames = ["tenant_id", "kind"]
|
||||
|
||||
# Create a unique key based on name and labelnames
|
||||
label_key = f"{name}_{','.join(sorted(labelnames))}"
|
||||
histogram_key = f"_histogram_{label_key}"
|
||||
|
||||
if not hasattr(self, histogram_key):
|
||||
histogram = Histogram(
|
||||
name,
|
||||
f"Dynamic histogram: {name}",
|
||||
labelnames=labelnames,
|
||||
registry=self.registry,
|
||||
)
|
||||
setattr(self, histogram_key, histogram)
|
||||
return getattr(self, histogram_key)
|
||||
|
||||
|
||||
def get_business_metrics(service_name: str) -> BusinessMetrics:
    """Return the shared ``BusinessMetrics`` for ``service_name``.

    The first call for a given name builds the instance and stores it in the
    module-level ``_business_metrics_registry``; later calls with the same
    name return that stored instance, so its metrics are registered only once.
    """
    cached = _business_metrics_registry.get(service_name)
    if cached is None:
        # First request for this service: build once and remember it.
        cached = BusinessMetrics(service_name)
        _business_metrics_registry[service_name] = cached
    return cached  # type: ignore
|
||||
Reference in New Issue
Block a user