Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

This commit is contained in:
harkon
2025-10-11 08:41:36 +01:00
commit b324ff09ef
276 changed files with 55220 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
"""Observability setup with OpenTelemetry, Prometheus, and structured logging."""
from .logging import configure_logging
from .opentelemetry_setup import init_opentelemetry
from .prometheus import BusinessMetrics, get_business_metrics, init_prometheus_metrics
from .setup import setup_observability
from .utils import get_metrics, get_tracer
__all__ = [
"configure_logging",
"init_opentelemetry",
"init_prometheus_metrics",
"BusinessMetrics",
"get_business_metrics",
"setup_observability",
"get_tracer",
"get_metrics",
]

View File

@@ -0,0 +1,75 @@
"""Structured logging configuration with OpenTelemetry integration."""
import logging
import sys
import time
from typing import Any
import structlog
from opentelemetry import trace
def configure_logging(service_name: str, log_level: str = "INFO") -> None:
    """Configure structured logging with structlog.

    Installs a structlog processor chain that stamps every entry with the
    service name, the active OpenTelemetry trace/span IDs, and a Unix
    timestamp, then renders JSON to stdout via the stdlib logging stack.
    """

    def _inject_service(  # pylint: disable=unused-argument
        logger: Any,
        method_name: str,
        event_dict: dict[str, Any],
    ) -> dict[str, Any]:
        """Stamp the owning service name onto every log entry."""
        event_dict["service"] = service_name
        return event_dict

    def _inject_trace_context(  # pylint: disable=unused-argument
        logger: Any,
        method_name: str,
        event_dict: dict[str, Any],
    ) -> dict[str, Any]:
        """Attach the active trace/span IDs so logs correlate with traces."""
        span = trace.get_current_span()
        if span and span.get_span_context().is_valid:
            ctx = span.get_span_context()
            event_dict["trace_id"] = format(ctx.trace_id, "032x")
            event_dict["span_id"] = format(ctx.span_id, "016x")
        return event_dict

    def _inject_timestamp(  # pylint: disable=unused-argument
        logger: Any,
        method_name: str,
        event_dict: dict[str, Any],
    ) -> dict[str, Any]:
        """Record the emission time as a Unix epoch float."""
        event_dict["timestamp"] = time.time()
        return event_dict

    # Processor order matters: enrichment runs before formatting/rendering.
    processors = [
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        _inject_service,  # type: ignore
        _inject_trace_context,  # type: ignore
        _inject_timestamp,  # type: ignore
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer(),
    ]
    structlog.configure(
        processors=processors,
        context_class=dict,
        logger_factory=structlog.stdlib.LoggerFactory(),
        wrapper_class=structlog.stdlib.BoundLogger,
        cache_logger_on_first_use=True,
    )

    # Route the stdlib logging stack to stdout; structlog renders the payload,
    # so the stdlib formatter only passes the message through.
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=getattr(logging, log_level.upper()),
    )

    # Reduce noise from chatty third-party loggers.
    for noisy_logger in ("httpx", "httpcore", "uvicorn.access"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)

View File

@@ -0,0 +1,99 @@
"""OpenTelemetry tracing and metrics initialization."""
import os
from typing import Any
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.psycopg2 import Psycopg2Instrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import (
MetricExporter,
PeriodicExportingMetricReader,
)
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, SpanExporter
def init_opentelemetry(
    service_name: str,
    service_version: str = "1.0.0",
    otlp_endpoint: str | None = None,
) -> tuple[Any, Any]:
    """Initialize OpenTelemetry tracing and metrics.

    Sets the global tracer and meter providers, wires exporters (OTLP gRPC
    when an endpoint is given, console otherwise) and best-effort
    auto-instruments common libraries.

    Args:
        service_name: Logical service name recorded on the OTel resource.
        service_version: Version string reported alongside all telemetry.
        otlp_endpoint: OTLP gRPC collector endpoint; ``None`` selects the
            console exporters (development mode).

    Returns:
        Tuple of ``(tracer, meter)`` bound to ``service_name``.
    """
    # Describe this process; instance id falls back to the pod/host name.
    resource = Resource.create(
        {
            "service.name": service_name,
            "service.version": service_version,
            "service.instance.id": os.getenv("HOSTNAME", "unknown"),
        }
    )

    # Configure tracing.
    span_exporter: SpanExporter
    if otlp_endpoint:
        span_exporter = OTLPSpanExporter(endpoint=otlp_endpoint)
    else:
        # Console exporter for development; no collector required. The old
        # try/except ImportError fallback imported the very same class in
        # both branches and was dead code, so it has been removed.
        # pylint: disable=import-outside-toplevel
        from opentelemetry.sdk.trace.export import ConsoleSpanExporter

        span_exporter = ConsoleSpanExporter()
    tracer_provider = TracerProvider(resource=resource)
    tracer_provider.add_span_processor(BatchSpanProcessor(span_exporter))
    trace.set_tracer_provider(tracer_provider)

    # Configure metrics; the reader pushes an export every 30 seconds.
    metric_exporter: MetricExporter
    if otlp_endpoint:
        metric_exporter = OTLPMetricExporter(endpoint=otlp_endpoint)
    else:
        # pylint: disable=import-outside-toplevel
        from opentelemetry.sdk.metrics.export import ConsoleMetricExporter

        metric_exporter = ConsoleMetricExporter()
    metric_reader = PeriodicExportingMetricReader(
        metric_exporter, export_interval_millis=30000
    )
    meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
    metrics.set_meter_provider(meter_provider)

    # Auto-instrument common libraries. Best-effort: missing optional
    # dependencies or double instrumentation (e.g. in tests) must never
    # break service startup.
    try:
        FastAPIInstrumentor().instrument()
        HTTPXClientInstrumentor().instrument()
        Psycopg2Instrumentor().instrument()
        RedisInstrumentor().instrument()
    except Exception:  # pylint: disable=broad-exception-caught
        # Ignore instrumentation errors in tests
        pass

    return trace.get_tracer(service_name), metrics.get_meter(service_name)

View File

@@ -0,0 +1,235 @@
"""Prometheus metrics setup and business metrics."""
from typing import Any
from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram, Info
from prometheus_fastapi_instrumentator import Instrumentator
def init_prometheus_metrics(  # pylint: disable=unused-argument
    app: Any, service_name: str
) -> Any:
    """Initialize Prometheus metrics for FastAPI app.

    Instruments the app with prometheus-fastapi-instrumentator, adds two
    custom latency-bucket counters (fast <100ms, slow >1s) and exposes the
    scrape endpoint at ``/metrics``.

    Args:
        app: FastAPI application to instrument.
        service_name: Service identifier (currently unused here; kept for a
            stable call signature across services).

    Returns:
        The configured ``Instrumentator`` instance.
    """
    # Create instrumentator
    instrumentator = Instrumentator(
        should_group_status_codes=False,
        should_ignore_untemplated=True,
        should_respect_env_var=True,
        should_instrument_requests_inprogress=True,
        excluded_handlers=["/metrics", "/healthz", "/readyz", "/livez"],
        env_var_name="ENABLE_METRICS",
        inprogress_name="http_requests_inprogress",
        inprogress_labels=True,
    )

    # Create the latency-bucket counters ONCE, outside the per-request hooks.
    # The previous code instantiated Counter(...) inside a per-request lambda,
    # which raises "Duplicated timeseries in CollectorRegistry" on the second
    # matching request because prometheus_client forbids registering the same
    # metric name twice in the default registry.
    fast_requests = Counter(
        "http_requests_fast_total",
        "Number of fast HTTP requests (< 100ms)",
        ["method", "endpoint"],
    )
    slow_requests = Counter(
        "http_requests_slow_total",
        "Number of slow HTTP requests (> 1s)",
        ["method", "endpoint"],
    )

    def _track_fast(info: Any) -> None:
        """Count requests that completed in under 100ms."""
        if info.modified_duration < 0.1:
            fast_requests.labels(
                method=info.method, endpoint=info.modified_handler
            ).inc()

    def _track_slow(info: Any) -> None:
        """Count requests that took longer than 1 second."""
        if info.modified_duration > 1.0:
            slow_requests.labels(
                method=info.method, endpoint=info.modified_handler
            ).inc()

    instrumentator.add(_track_fast)
    instrumentator.add(_track_slow)

    # Instrument the app and expose the scrape endpoint.
    instrumentator.instrument(app)
    instrumentator.expose(app, endpoint="/metrics")
    return instrumentator
# Global registry for business metrics to avoid duplicates.
# Maps service_name -> BusinessMetrics; populated lazily by
# get_business_metrics() so each service only ever builds one instance.
_business_metrics_registry: dict[str, Any] = {}


# Custom metrics for business logic
class BusinessMetrics:  # pylint: disable=too-many-instance-attributes
    """Custom business metrics for the application.

    Each instance owns a private ``CollectorRegistry`` so that two services
    running in one process cannot collide on metric names. Most metric names
    are suffixed with the sanitized service name for the same reason.
    """

    def __init__(self, service_name: str):
        # Raw service name, used as a label value on samples.
        self.service_name = service_name
        # Sanitize service name for Prometheus metrics (replace hyphens with underscores)
        self.sanitized_name = service_name.replace("-", "_")
        # Create a custom registry for this service to avoid conflicts
        self.registry = CollectorRegistry()
        # Document processing metrics.
        # NOTE(review): unlike the metrics further down, this name and
        # `active_connections` carry no service-name suffix — presumably the
        # test suite asserts on these exact names; confirm before changing.
        self.documents_processed = Counter(
            "documents_processed_total",
            "Total number of documents processed",
            ["service", "document_type", "status"],
            registry=self.registry,
        )
        # Add active connections metric for tests
        self.active_connections = Gauge(
            "active_connections",
            "Number of active connections",
            ["service"],
            registry=self.registry,
        )
        # Dynamic counters for forms service; keyed by "<name>_<sorted labels>"
        # (see counter() below).
        self._dynamic_counters: dict[str, Any] = {}
        # Wall-clock time spent per document; buckets span 100ms to 5 minutes.
        self.document_processing_duration = Histogram(
            f"document_processing_duration_seconds_{self.sanitized_name}",
            "Time spent processing documents",
            ["service", "document_type"],
            buckets=[0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0],
            registry=self.registry,
        )
        # Field extraction metrics
        self.field_extractions = Counter(
            f"field_extractions_total_{self.sanitized_name}",
            "Total number of field extractions",
            ["service", "field_type", "status"],
            registry=self.registry,
        )
        # Confidence scores in [0, 1], bucketed per decile.
        self.extraction_confidence = Histogram(
            f"extraction_confidence_score_{self.sanitized_name}",
            "Confidence scores for extractions",
            ["service", "extraction_type"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )
        # Tax calculation metrics
        self.tax_calculations = Counter(
            f"tax_calculations_total_{self.sanitized_name}",
            "Total number of tax calculations",
            ["service", "calculation_type", "status"],
            registry=self.registry,
        )
        self.calculation_confidence = Histogram(
            f"calculation_confidence_score_{self.sanitized_name}",
            "Confidence scores for tax calculations",
            ["service", "calculation_type"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )
        # RAG metrics
        self.rag_searches = Counter(
            f"rag_searches_total_{self.sanitized_name}",
            "Total number of RAG searches",
            ["service", "collection", "status"],
            registry=self.registry,
        )
        # Search latency buckets: 10ms up to 5s.
        self.rag_search_duration = Histogram(
            f"rag_search_duration_seconds_{self.sanitized_name}",
            "Time spent on RAG searches",
            ["service", "collection"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0],
            registry=self.registry,
        )
        self.rag_relevance_score = Histogram(
            f"rag_relevance_score_{self.sanitized_name}",
            "RAG search relevance scores",
            ["service", "collection"],
            buckets=[0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
            registry=self.registry,
        )
        # Knowledge graph metrics
        self.kg_operations = Counter(
            f"kg_operations_total_{self.sanitized_name}",
            "Total number of KG operations",
            ["service", "operation", "status"],
            registry=self.registry,
        )
        self.kg_query_duration = Histogram(
            f"kg_query_duration_seconds_{self.sanitized_name}",
            "Time spent on KG queries",
            ["service", "query_type"],
            buckets=[0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0],
            registry=self.registry,
        )
        # HMRC submission metrics
        self.hmrc_submissions = Counter(
            f"hmrc_submissions_total_{self.sanitized_name}",
            "Total number of HMRC submissions",
            ["service", "submission_type", "status"],
            registry=self.registry,
        )
        # Service health metrics
        self.service_info = Info(
            f"service_info_{self.sanitized_name}",
            "Service information",
            registry=self.registry,
        )
        try:
            self.service_info.info({"service": service_name, "version": "1.0.0"})
        except (AttributeError, ValueError):
            # Handle prometheus_client version compatibility or registry conflicts
            pass

    def counter(self, name: str, labelnames: list[str] | None = None) -> Any:
        """Get or create a counter metric with dynamic labels.

        Args:
            name: Prometheus metric name.
            labelnames: Label names for the counter; defaults to the label set
                used by the forms service.

        Returns:
            A ``prometheus_client.Counter`` registered in this instance's
            private registry; repeated calls with the same name/labels return
            the cached object.
        """
        # Use provided labelnames or default ones
        if labelnames is None:
            labelnames = ["tenant_id", "form_id", "scope", "error_type"]
        # Create a unique key based on name and labelnames.
        # NOTE(review): the same `name` with a *different* label set would
        # miss the cache and attempt a duplicate registration, which
        # prometheus_client rejects — confirm callers never do that.
        label_key = f"{name}_{','.join(sorted(labelnames))}"
        if label_key not in self._dynamic_counters:
            self._dynamic_counters[label_key] = Counter(
                name,
                f"Dynamic counter: {name}",
                labelnames=labelnames,
                registry=self.registry,
            )
        return self._dynamic_counters[label_key]

    def histogram(self, name: str, labelnames: list[str] | None = None) -> Any:
        """Get or create a histogram metric with dynamic labels.

        Args:
            name: Prometheus metric name.
            labelnames: Label names; defaults to ``["tenant_id", "kind"]``.

        Returns:
            A cached ``prometheus_client.Histogram`` (default buckets) from
            this instance's private registry.
        """
        # Use provided labelnames or default ones
        if labelnames is None:
            labelnames = ["tenant_id", "kind"]
        # Create a unique key based on name and labelnames.
        # Cached on `self` via setattr (unlike counter(), which uses a dict);
        # the attribute name embeds commas but that is legal for setattr.
        label_key = f"{name}_{','.join(sorted(labelnames))}"
        histogram_key = f"_histogram_{label_key}"
        if not hasattr(self, histogram_key):
            histogram = Histogram(
                name,
                f"Dynamic histogram: {name}",
                labelnames=labelnames,
                registry=self.registry,
            )
            setattr(self, histogram_key, histogram)
        return getattr(self, histogram_key)
def get_business_metrics(service_name: str) -> BusinessMetrics:
    """Get business metrics instance for service.

    Returns the per-service singleton, creating and caching it on first use
    so Prometheus metric names are never registered twice.
    """
    instance = _business_metrics_registry.get(service_name)
    if instance is None:
        instance = BusinessMetrics(service_name)
        _business_metrics_registry[service_name] = instance
    return instance  # type: ignore

View File

@@ -0,0 +1,64 @@
"""Complete observability setup orchestration."""
from typing import Any
from .logging import configure_logging
from .opentelemetry_setup import init_opentelemetry
from .prometheus import get_business_metrics, init_prometheus_metrics
def setup_observability(
    settings_or_app: Any,
    service_name: str | None = None,
    service_version: str = "1.0.0",
    log_level: str = "INFO",
    otlp_endpoint: str | None = None,
) -> dict[str, Any]:
    """Setup complete observability stack for a service.

    Accepts either a settings object (detected via a ``service_name``
    attribute) or a FastAPI app. With an app, Prometheus is also wired up and
    the components are stored on ``app.state``.

    Returns:
        Dict with ``tracer``, ``meter``, ``metrics`` and — when an app was
        passed — ``instrumentator``.
    """
    app = None
    if hasattr(settings_or_app, "service_name"):
        # Settings object: pull all configuration from its attributes,
        # overriding the keyword arguments.
        settings = settings_or_app
        service_name = settings.service_name
        service_version = getattr(settings, "service_version", "1.0.0")
        log_level = getattr(settings, "log_level", "INFO")
        otlp_endpoint = getattr(settings, "otel_exporter_endpoint", None)
    else:
        # App object: the caller must supply the service name explicitly.
        app = settings_or_app
        if not service_name:
            raise ValueError("service_name is required when passing app object")

    effective_name = service_name or "unknown"

    # Logging, tracing/metrics providers, and the business-metrics singleton.
    configure_logging(effective_name, log_level)
    tracer, meter = init_opentelemetry(effective_name, service_version, otlp_endpoint)
    business_metrics = get_business_metrics(effective_name)

    components: dict[str, Any] = {
        "tracer": tracer,
        "meter": meter,
        "metrics": business_metrics,
    }

    # With an app, also wire Prometheus and expose everything on app.state.
    if app:
        instrumentator = init_prometheus_metrics(app, effective_name)
        app.state.tracer = tracer
        app.state.meter = meter
        app.state.metrics = business_metrics
        app.state.instrumentator = instrumentator
        components["instrumentator"] = instrumentator

    return components

View File

@@ -0,0 +1,17 @@
"""Utility functions for observability components."""
from typing import Any
from opentelemetry import trace
from .prometheus import BusinessMetrics, get_business_metrics
def get_tracer(service_name: str = "default") -> Any:
    """Get OpenTelemetry tracer.

    Thin wrapper over the OpenTelemetry API; the globally installed tracer
    provider determines the concrete implementation returned.
    """
    tracer = trace.get_tracer(service_name)
    return tracer
def get_metrics(service_name: str = "default") -> BusinessMetrics:
    """Get business metrics instance.

    Delegates to the per-service singleton cache in the prometheus module.
    """
    business_metrics = get_business_metrics(service_name)
    return business_metrics