"""Prometheus metrics for event bus monitoring.""" from prometheus_client import Counter, Histogram from prometheus_client.registry import CollectorRegistry # Global registry for event metrics _event_registry = CollectorRegistry() # Event publishing metrics event_published_total = Counter( "event_published_total", "Total number of events published", ["topic"], registry=_event_registry, ) event_publish_errors_total = Counter( "event_publish_errors_total", "Total number of event publishing errors", ["topic", "error_type"], registry=_event_registry, ) event_publishing_duration_seconds = Histogram( "event_publishing_duration_seconds", "Time spent publishing events in seconds", ["topic"], buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0), registry=_event_registry, ) # Event consumption metrics event_consumed_total = Counter( "event_consumed_total", "Total number of events consumed", ["topic", "consumer_group"], registry=_event_registry, ) event_processing_duration_seconds = Histogram( "event_processing_duration_seconds", "Time spent processing events in seconds", ["topic", "consumer_group"], buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0), registry=_event_registry, ) event_processing_errors_total = Counter( "event_processing_errors_total", "Total number of event processing errors", ["topic", "consumer_group", "error_type"], registry=_event_registry, ) # DLQ metrics event_dlq_total = Counter( "event_dlq_total", "Total number of events sent to dead letter queue", ["topic", "error_type"], registry=_event_registry, ) event_retry_total = Counter( "event_retry_total", "Total number of event retry attempts", ["topic", "retry_attempt"], registry=_event_registry, ) # Schema validation metrics event_schema_validation_errors_total = Counter( "event_schema_validation_errors_total", "Total number of event schema validation errors", ["topic", "validation_error"], registry=_event_registry, ) # NATS JetStream specific metrics nats_stream_messages_total = Counter( "nats_stream_messages_total", "Total messages in NATS stream", ["stream_name"], registry=_event_registry, ) nats_consumer_lag_messages = Histogram( "nats_consumer_lag_messages", "Number of messages consumer is lagging behind", ["stream_name", "consumer_group"], buckets=(0, 1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000), registry=_event_registry, ) def get_event_metrics_registry() -> CollectorRegistry: """ Get the Prometheus registry for event metrics. Returns: CollectorRegistry for event metrics """ return _event_registry class EventMetricsCollector: """Helper class for collecting event metrics.""" @staticmethod def record_publish( topic: str, duration_seconds: float, success: bool = True, error_type: str | None = None, ) -> None: """ Record event publishing metrics. Args: topic: Event topic name duration_seconds: Time taken to publish success: Whether publishing succeeded error_type: Type of error if failed """ if success: event_published_total.labels(topic=topic).inc() else: event_publish_errors_total.labels( topic=topic, error_type=error_type or "unknown" ).inc() event_publishing_duration_seconds.labels(topic=topic).observe(duration_seconds) @staticmethod def record_consume( topic: str, consumer_group: str, duration_seconds: float, success: bool = True, error_type: str | None = None, ) -> None: """ Record event consumption metrics. 
class EventMetricsCollector:
    """Helper class for collecting event metrics."""

    @staticmethod
    def record_publish(
        topic: str,
        duration_seconds: float,
        success: bool = True,
        error_type: str | None = None,
    ) -> None:
        """
        Record event publishing metrics.

        Args:
            topic: Event topic name
            duration_seconds: Time taken to publish
            success: Whether publishing succeeded
            error_type: Type of error if failed
        """
        if success:
            event_published_total.labels(topic=topic).inc()
        else:
            event_publish_errors_total.labels(
                topic=topic, error_type=error_type or "unknown"
            ).inc()

        event_publishing_duration_seconds.labels(topic=topic).observe(duration_seconds)

    @staticmethod
    def record_consume(
        topic: str,
        consumer_group: str,
        duration_seconds: float,
        success: bool = True,
        error_type: str | None = None,
    ) -> None:
        """
        Record event consumption metrics.

        Args:
            topic: Event topic name
            consumer_group: Consumer group name
            duration_seconds: Time taken to process event
            success: Whether processing succeeded
            error_type: Type of error if failed
        """
        if success:
            event_consumed_total.labels(
                topic=topic, consumer_group=consumer_group
            ).inc()
        else:
            event_processing_errors_total.labels(
                topic=topic,
                consumer_group=consumer_group,
                error_type=error_type or "unknown",
            ).inc()

        event_processing_duration_seconds.labels(
            topic=topic, consumer_group=consumer_group
        ).observe(duration_seconds)

    @staticmethod
    def record_dlq(topic: str, error_type: str) -> None:
        """
        Record event sent to DLQ.

        Args:
            topic: Event topic name
            error_type: Type of error that caused DLQ
        """
        event_dlq_total.labels(topic=topic, error_type=error_type).inc()

    @staticmethod
    def record_retry(topic: str, retry_attempt: int) -> None:
        """
        Record event retry attempt.

        Args:
            topic: Event topic name
            retry_attempt: Retry attempt number (1-indexed)
        """
        event_retry_total.labels(topic=topic, retry_attempt=str(retry_attempt)).inc()

    @staticmethod
    def record_schema_validation_error(topic: str, validation_error: str) -> None:
        """
        Record schema validation error.

        Args:
            topic: Event topic name
            validation_error: Type of validation error
        """
        event_schema_validation_errors_total.labels(
            topic=topic, validation_error=validation_error
        ).inc()

    @staticmethod
    def record_nats_stream_message(stream_name: str) -> None:
        """
        Record message added to NATS stream.

        Args:
            stream_name: NATS stream name
        """
        nats_stream_messages_total.labels(stream_name=stream_name).inc()

    @staticmethod
    def record_consumer_lag(
        stream_name: str, consumer_group: str, lag_messages: int
    ) -> None:
        """
        Record consumer lag.

        Args:
            stream_name: NATS stream name
            consumer_group: Consumer group name
            lag_messages: Number of messages consumer is behind
        """
        nats_consumer_lag_messages.labels(
            stream_name=stream_name, consumer_group=consumer_group
        ).observe(lag_messages)
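

# --- Usage sketch (not part of the original module) ---------------------------
# A minimal, hedged example of wrapping a publish call with metrics collection
# via EventMetricsCollector. The publish_with_metrics helper and its publish_fn
# parameter are hypothetical; only the collector API defined above and the
# standard library are assumed.
def publish_with_metrics(publish_fn, topic: str, payload: bytes) -> None:
    """Hypothetical wrapper: time a publish call and record success or failure.

    ``publish_fn`` is assumed to be a callable of the form
    ``publish_fn(topic: str, payload: bytes) -> None``.
    """
    import time  # local import to keep the sketch self-contained

    start = time.perf_counter()
    try:
        publish_fn(topic, payload)
    except Exception as exc:
        # Record the failed attempt, using the exception class name as the error type.
        EventMetricsCollector.record_publish(
            topic,
            time.perf_counter() - start,
            success=False,
            error_type=type(exc).__name__,
        )
        raise
    EventMetricsCollector.record_publish(topic, time.perf_counter() - start)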