Files
ai-tax-agent/libs/events/metrics.py
harkon fdba81809f
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
completed local setup with compose
2025-11-26 13:17:17 +00:00

226 lines
6.3 KiB
Python

"""Prometheus metrics for event bus monitoring."""
from prometheus_client import Counter, Histogram
from prometheus_client.registry import CollectorRegistry
# A dedicated registry keeps event-bus metrics separate from the
# process-wide default prometheus registry, so they can be exposed
# and scraped independently.
_event_registry = CollectorRegistry()

# --- Publishing metrics -------------------------------------------------

event_published_total = Counter(
    name="event_published_total",
    documentation="Total number of events published",
    labelnames=["topic"],
    registry=_event_registry,
)

event_publish_errors_total = Counter(
    name="event_publish_errors_total",
    documentation="Total number of event publishing errors",
    labelnames=["topic", "error_type"],
    registry=_event_registry,
)

event_publishing_duration_seconds = Histogram(
    name="event_publishing_duration_seconds",
    documentation="Time spent publishing events in seconds",
    labelnames=["topic"],
    # Sub-millisecond through 10s: publishing is expected to be fast.
    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
    registry=_event_registry,
)

# --- Consumption metrics ------------------------------------------------

event_consumed_total = Counter(
    name="event_consumed_total",
    documentation="Total number of events consumed",
    labelnames=["topic", "consumer_group"],
    registry=_event_registry,
)

event_processing_duration_seconds = Histogram(
    name="event_processing_duration_seconds",
    documentation="Time spent processing events in seconds",
    labelnames=["topic", "consumer_group"],
    # Wider range than publishing (up to 60s): handlers may do real work.
    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
    registry=_event_registry,
)

event_processing_errors_total = Counter(
    name="event_processing_errors_total",
    documentation="Total number of event processing errors",
    labelnames=["topic", "consumer_group", "error_type"],
    registry=_event_registry,
)

# --- Dead-letter-queue / retry metrics ----------------------------------

event_dlq_total = Counter(
    name="event_dlq_total",
    documentation="Total number of events sent to dead letter queue",
    labelnames=["topic", "error_type"],
    registry=_event_registry,
)

event_retry_total = Counter(
    name="event_retry_total",
    documentation="Total number of event retry attempts",
    labelnames=["topic", "retry_attempt"],
    registry=_event_registry,
)

# --- Schema validation metrics ------------------------------------------

event_schema_validation_errors_total = Counter(
    name="event_schema_validation_errors_total",
    documentation="Total number of event schema validation errors",
    labelnames=["topic", "validation_error"],
    registry=_event_registry,
)

# --- NATS JetStream metrics ---------------------------------------------

nats_stream_messages_total = Counter(
    name="nats_stream_messages_total",
    documentation="Total messages in NATS stream",
    labelnames=["stream_name"],
    registry=_event_registry,
)

# NOTE(review): lag is a point-in-time value, so a Gauge may be a better
# fit than a Histogram — kept as a Histogram to preserve the exported
# metric type and series names.
nats_consumer_lag_messages = Histogram(
    name="nats_consumer_lag_messages",
    documentation="Number of messages consumer is lagging behind",
    labelnames=["stream_name", "consumer_group"],
    buckets=(0, 1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000),
    registry=_event_registry,
)
def get_event_metrics_registry() -> CollectorRegistry:
    """Return the registry holding the event-bus metrics.

    Returns:
        The module-private ``CollectorRegistry`` that every metric in
        this module is registered against; hand it to a scrape endpoint
        to expose these metrics.
    """
    return _event_registry
class EventMetricsCollector:
    """Static helpers that funnel event-bus activity into the module metrics.

    Every method is a thin, stateless wrapper around one or two of the
    Prometheus metrics defined at module level; label values are passed
    positionally in the order each metric declares its label names.
    """

    @staticmethod
    def record_publish(
        topic: str,
        duration_seconds: float,
        success: bool = True,
        error_type: str | None = None,
    ) -> None:
        """
        Record event publishing metrics.

        Args:
            topic: Event topic name
            duration_seconds: Time taken to publish
            success: Whether publishing succeeded
            error_type: Type of error if failed
        """
        # Duration is observed for both outcomes; only the counter
        # matching the outcome is incremented.
        if success:
            event_published_total.labels(topic).inc()
        else:
            event_publish_errors_total.labels(topic, error_type or "unknown").inc()
        event_publishing_duration_seconds.labels(topic).observe(duration_seconds)

    @staticmethod
    def record_consume(
        topic: str,
        consumer_group: str,
        duration_seconds: float,
        success: bool = True,
        error_type: str | None = None,
    ) -> None:
        """
        Record event consumption metrics.

        Args:
            topic: Event topic name
            consumer_group: Consumer group name
            duration_seconds: Time taken to process event
            success: Whether processing succeeded
            error_type: Type of error if failed
        """
        # Same pattern as record_publish: duration always, counter by outcome.
        if success:
            event_consumed_total.labels(topic, consumer_group).inc()
        else:
            event_processing_errors_total.labels(
                topic, consumer_group, error_type or "unknown"
            ).inc()
        event_processing_duration_seconds.labels(topic, consumer_group).observe(
            duration_seconds
        )

    @staticmethod
    def record_dlq(topic: str, error_type: str) -> None:
        """
        Record event sent to DLQ.

        Args:
            topic: Event topic name
            error_type: Type of error that caused DLQ
        """
        event_dlq_total.labels(topic, error_type).inc()

    @staticmethod
    def record_retry(topic: str, retry_attempt: int) -> None:
        """
        Record event retry attempt.

        Args:
            topic: Event topic name
            retry_attempt: Retry attempt number (1-indexed)
        """
        # Label values must be strings; stringify the attempt number.
        event_retry_total.labels(topic, str(retry_attempt)).inc()

    @staticmethod
    def record_schema_validation_error(topic: str, validation_error: str) -> None:
        """
        Record schema validation error.

        Args:
            topic: Event topic name
            validation_error: Type of validation error
        """
        event_schema_validation_errors_total.labels(topic, validation_error).inc()

    @staticmethod
    def record_nats_stream_message(stream_name: str) -> None:
        """
        Record message added to NATS stream.

        Args:
            stream_name: NATS stream name
        """
        nats_stream_messages_total.labels(stream_name).inc()

    @staticmethod
    def record_consumer_lag(
        stream_name: str, consumer_group: str, lag_messages: int
    ) -> None:
        """
        Record consumer lag.

        Args:
            stream_name: NATS stream name
            consumer_group: Consumer group name
            lag_messages: Number of messages consumer is behind
        """
        nats_consumer_lag_messages.labels(stream_name, consumer_group).observe(
            lag_messages
        )