Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
226 lines
6.3 KiB
Python
226 lines
6.3 KiB
Python
"""Prometheus metrics for event bus monitoring."""
|
|
|
|
from prometheus_client import Counter, Histogram
|
|
from prometheus_client.registry import CollectorRegistry
|
|
|
|
# Global registry for event metrics
|
|
# A separate CollectorRegistry instance (not prometheus_client's default
# registry). Every collector defined below passes ``registry=_event_registry``,
# and get_event_metrics_registry() hands this same object back to callers.
_event_registry = CollectorRegistry()
|
|
|
|
# ---------------------------------------------------------------------------
# Publishing side
# ---------------------------------------------------------------------------

# Incremented by EventMetricsCollector.record_publish when ``success`` is true.
event_published_total = Counter(
    name="event_published_total",
    documentation="Total number of events published",
    labelnames=("topic",),
    registry=_event_registry,
)

# Incremented by EventMetricsCollector.record_publish when ``success`` is
# false; ``error_type`` falls back to "unknown" when the caller gives none.
event_publish_errors_total = Counter(
    name="event_publish_errors_total",
    documentation="Total number of event publishing errors",
    labelnames=("topic", "error_type"),
    registry=_event_registry,
)

# Observed on every record_publish call, whether it succeeded or not.
# Bucket upper bounds run from 1 ms up to 10 s.
event_publishing_duration_seconds = Histogram(
    name="event_publishing_duration_seconds",
    documentation="Time spent publishing events in seconds",
    labelnames=("topic",),
    buckets=(
        0.001, 0.005, 0.01, 0.025, 0.05, 0.1,
        0.25, 0.5, 1.0, 2.5, 5.0, 10.0,
    ),
    registry=_event_registry,
)
|
|
|
|
# ---------------------------------------------------------------------------
# Consumption side
# ---------------------------------------------------------------------------

# Incremented by EventMetricsCollector.record_consume when ``success`` is true.
event_consumed_total = Counter(
    name="event_consumed_total",
    documentation="Total number of events consumed",
    labelnames=("topic", "consumer_group"),
    registry=_event_registry,
)

# Observed on every record_consume call, whether it succeeded or not.
# Bucket upper bounds run from 10 ms up to 60 s.
event_processing_duration_seconds = Histogram(
    name="event_processing_duration_seconds",
    documentation="Time spent processing events in seconds",
    labelnames=("topic", "consumer_group"),
    buckets=(
        0.01, 0.025, 0.05, 0.1, 0.25, 0.5,
        1.0, 2.5, 5.0, 10.0, 30.0, 60.0,
    ),
    registry=_event_registry,
)

# Incremented by EventMetricsCollector.record_consume when ``success`` is
# false; ``error_type`` falls back to "unknown" when the caller gives none.
event_processing_errors_total = Counter(
    name="event_processing_errors_total",
    documentation="Total number of event processing errors",
    labelnames=("topic", "consumer_group", "error_type"),
    registry=_event_registry,
)
|
|
|
|
# ---------------------------------------------------------------------------
# Dead letter queue (DLQ) and retries
# ---------------------------------------------------------------------------

# Incremented once per EventMetricsCollector.record_dlq call.
event_dlq_total = Counter(
    name="event_dlq_total",
    documentation="Total number of events sent to dead letter queue",
    labelnames=("topic", "error_type"),
    registry=_event_registry,
)

# Incremented once per EventMetricsCollector.record_retry call; the
# ``retry_attempt`` label carries the attempt number converted with str().
event_retry_total = Counter(
    name="event_retry_total",
    documentation="Total number of event retry attempts",
    labelnames=("topic", "retry_attempt"),
    registry=_event_registry,
)
|
|
|
|
# ---------------------------------------------------------------------------
# Schema validation
# ---------------------------------------------------------------------------

# Incremented once per EventMetricsCollector.record_schema_validation_error
# call, labelled with the topic and the caller-supplied validation error.
event_schema_validation_errors_total = Counter(
    name="event_schema_validation_errors_total",
    documentation="Total number of event schema validation errors",
    labelnames=("topic", "validation_error"),
    registry=_event_registry,
)
|
|
|
|
# ---------------------------------------------------------------------------
# NATS JetStream
# ---------------------------------------------------------------------------

# Incremented once per EventMetricsCollector.record_nats_stream_message call.
nats_stream_messages_total = Counter(
    name="nats_stream_messages_total",
    documentation="Total messages in NATS stream",
    labelnames=("stream_name",),
    registry=_event_registry,
)

# A histogram of the lag values passed to record_consumer_lag, i.e. a
# distribution of samples rather than a gauge of the current lag.
# Bucket upper bounds are message counts from 0 up to 10000.
nats_consumer_lag_messages = Histogram(
    name="nats_consumer_lag_messages",
    documentation="Number of messages consumer is lagging behind",
    labelnames=("stream_name", "consumer_group"),
    buckets=(0, 1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000),
    registry=_event_registry,
)
|
|
|
|
|
|
def get_event_metrics_registry() -> CollectorRegistry:
    """Return the registry that this module's event metrics are registered in.

    The same module-level ``CollectorRegistry`` object is returned on every
    call; no copy is made.

    Returns:
        The CollectorRegistry holding the event bus collectors.
    """
    return _event_registry
|
|
|
|
|
|
class EventMetricsCollector:
    """Stateless helper that updates the module-level event bus collectors.

    Every method is a ``@staticmethod``: it looks up the labelled child of
    one of this module's Prometheus collectors and increments or observes
    it. The class holds no instance state.
    """

    @staticmethod
    def record_publish(
        topic: str,
        duration_seconds: float,
        success: bool = True,
        error_type: str | None = None,
    ) -> None:
        """Track the outcome and duration of one publish attempt.

        Args:
            topic: Event topic name.
            duration_seconds: Time the publish took, in seconds.
            success: Truthy counts a published event; falsy counts an error.
            error_type: Error label used on failure. A falsy value is
                recorded as "unknown". Ignored on success.
        """
        if not success:
            failure_label = error_type or "unknown"
            event_publish_errors_total.labels(
                topic=topic, error_type=failure_label
            ).inc()
        else:
            event_published_total.labels(topic=topic).inc()

        # The duration is observed for failed attempts as well.
        publish_timer = event_publishing_duration_seconds.labels(topic=topic)
        publish_timer.observe(duration_seconds)

    @staticmethod
    def record_consume(
        topic: str,
        consumer_group: str,
        duration_seconds: float,
        success: bool = True,
        error_type: str | None = None,
    ) -> None:
        """Track the outcome and duration of processing one consumed event.

        Args:
            topic: Event topic name.
            consumer_group: Consumer group name.
            duration_seconds: Time the processing took, in seconds.
            success: Truthy counts a consumed event; falsy counts an error.
            error_type: Error label used on failure. A falsy value is
                recorded as "unknown". Ignored on success.
        """
        # Label pair shared by the success counter and the duration histogram.
        group_labels = {"topic": topic, "consumer_group": consumer_group}

        if not success:
            failure_label = error_type or "unknown"
            event_processing_errors_total.labels(
                **group_labels, error_type=failure_label
            ).inc()
        else:
            event_consumed_total.labels(**group_labels).inc()

        # The duration is observed for failed attempts as well.
        processing_timer = event_processing_duration_seconds.labels(**group_labels)
        processing_timer.observe(duration_seconds)

    @staticmethod
    def record_dlq(topic: str, error_type: str) -> None:
        """Count one event sent to the dead letter queue.

        Args:
            topic: Event topic name.
            error_type: Type of error that caused the DLQ routing; used
                as-is as the label value.
        """
        dlq_counter = event_dlq_total.labels(topic=topic, error_type=error_type)
        dlq_counter.inc()

    @staticmethod
    def record_retry(topic: str, retry_attempt: int) -> None:
        """Count one retry attempt for an event.

        Args:
            topic: Event topic name.
            retry_attempt: Retry attempt number (1-indexed); converted with
                ``str()`` to form the label value.
        """
        attempt_label = str(retry_attempt)
        retry_counter = event_retry_total.labels(
            topic=topic, retry_attempt=attempt_label
        )
        retry_counter.inc()

    @staticmethod
    def record_schema_validation_error(topic: str, validation_error: str) -> None:
        """Count one schema validation failure.

        Args:
            topic: Event topic name.
            validation_error: Type of validation error; used as-is as the
                label value.
        """
        validation_counter = event_schema_validation_errors_total.labels(
            topic=topic, validation_error=validation_error
        )
        validation_counter.inc()

    @staticmethod
    def record_nats_stream_message(stream_name: str) -> None:
        """Count one message added to a NATS stream.

        Args:
            stream_name: NATS stream name.
        """
        stream_counter = nats_stream_messages_total.labels(stream_name=stream_name)
        stream_counter.inc()

    @staticmethod
    def record_consumer_lag(
        stream_name: str, consumer_group: str, lag_messages: int
    ) -> None:
        """Add one consumer-lag sample to the lag histogram.

        Args:
            stream_name: NATS stream name.
            consumer_group: Consumer group name.
            lag_messages: Number of messages the consumer is behind.
        """
        lag_histogram = nats_consumer_lag_messages.labels(
            stream_name=stream_name, consumer_group=consumer_group
        )
        lag_histogram.observe(lag_messages)
|