completed local setup with compose
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import time
|
||||
from collections.abc import Awaitable, Callable
|
||||
from typing import Any
|
||||
|
||||
@@ -12,6 +13,8 @@ from nats.js import JetStreamContext
|
||||
from nats.js.api import AckPolicy, ConsumerConfig, DeliverPolicy
|
||||
|
||||
from .base import EventBus, EventPayload
|
||||
from .dlq import DLQHandler
|
||||
from .metrics import EventMetricsCollector
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
@@ -24,6 +27,8 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
servers: str | list[str] = "nats://localhost:4222",
|
||||
stream_name: str = "TAX_AGENT_EVENTS",
|
||||
consumer_group: str = "tax-agent",
|
||||
dlq_stream_name: str = "TAX_AGENT_DLQ",
|
||||
max_retries: int = 3,
|
||||
):
|
||||
if isinstance(servers, str):
|
||||
self.servers = [servers]
|
||||
@@ -32,8 +37,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
|
||||
self.stream_name = stream_name
|
||||
self.consumer_group = consumer_group
|
||||
self.dlq_stream_name = dlq_stream_name
|
||||
self.max_retries = max_retries
|
||||
|
||||
self.nc: NATS | None = None
|
||||
self.js: JetStreamContext | None = None
|
||||
self.dlq: DLQHandler | None = None
|
||||
|
||||
self.handlers: dict[
|
||||
str, list[Callable[[str, EventPayload], Awaitable[None]]]
|
||||
] = {}
|
||||
@@ -48,19 +58,32 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
|
||||
try:
|
||||
# Connect to NATS
|
||||
self.nc = await nats.connect(servers=self.servers)
|
||||
self.nc = await nats.connect(
|
||||
servers=self.servers,
|
||||
connect_timeout=10,
|
||||
reconnect_time_wait=1,
|
||||
)
|
||||
|
||||
# Get JetStream context
|
||||
self.js = self.nc.jetstream()
|
||||
self.js = self.nc.jetstream(timeout=10)
|
||||
|
||||
# Ensure stream exists
|
||||
# Initialize DLQ handler
|
||||
self.dlq = DLQHandler(
|
||||
js=self.js,
|
||||
dlq_stream_name=self.dlq_stream_name,
|
||||
max_retries=self.max_retries,
|
||||
)
|
||||
|
||||
# Ensure streams exist
|
||||
await self._ensure_stream_exists()
|
||||
await self.dlq.ensure_dlq_stream_exists()
|
||||
|
||||
self.running = True
|
||||
logger.info(
|
||||
"NATS event bus started",
|
||||
servers=self.servers,
|
||||
stream=self.stream_name,
|
||||
dlq_stream=self.dlq_stream_name,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
@@ -98,6 +121,7 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
if not self.js:
|
||||
raise RuntimeError("Event bus not started")
|
||||
|
||||
start_time = time.perf_counter()
|
||||
try:
|
||||
# Create subject name from topic
|
||||
subject = f"{self.stream_name}.{topic}"
|
||||
@@ -117,6 +141,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
headers=headers,
|
||||
)
|
||||
|
||||
duration = time.perf_counter() - start_time
|
||||
EventMetricsCollector.record_publish(
|
||||
topic=topic,
|
||||
duration_seconds=duration,
|
||||
success=True,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Event published",
|
||||
topic=topic,
|
||||
@@ -127,6 +158,14 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
return True
|
||||
|
||||
except Exception as e: # pylint: disable=broad-exception-caught
|
||||
duration = time.perf_counter() - start_time
|
||||
EventMetricsCollector.record_publish(
|
||||
topic=topic,
|
||||
duration_seconds=duration,
|
||||
success=False,
|
||||
error_type=type(e).__name__,
|
||||
)
|
||||
|
||||
logger.error(
|
||||
"Failed to publish event",
|
||||
topic=topic,
|
||||
@@ -152,9 +191,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
subject = f"{self.stream_name}.{topic}"
|
||||
|
||||
# Create durable consumer
|
||||
consumer_name = f"{self.consumer_group}-{topic}"
|
||||
# Durable names cannot contain dots, so we replace them
|
||||
safe_topic = topic.replace(".", "-")
|
||||
consumer_name = f"{self.consumer_group}-{safe_topic}"
|
||||
|
||||
# Subscribe with pull-based consumer
|
||||
# Set max_deliver to max_retries + 2 (initial + retries + one spare delivery)
|
||||
# We handle DLQ manually before NATS gives up
|
||||
subscription = await self.js.pull_subscribe(
|
||||
subject=subject,
|
||||
durable=consumer_name,
|
||||
@@ -162,7 +205,7 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
durable_name=consumer_name,
|
||||
ack_policy=AckPolicy.EXPLICIT,
|
||||
deliver_policy=DeliverPolicy.NEW,
|
||||
max_deliver=3,
|
||||
max_deliver=self.max_retries + 2, # Give us room to handle DLQ
|
||||
ack_wait=30, # 30 seconds
|
||||
),
|
||||
)
|
||||
@@ -193,13 +236,14 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
# Try to get stream info
|
||||
await self.js.stream_info(self.stream_name)
|
||||
logger.debug("Stream already exists", stream=self.stream_name)
|
||||
EventMetricsCollector.record_nats_stream_message(self.stream_name)
|
||||
|
||||
except Exception:
|
||||
# Stream doesn't exist, create it
|
||||
try:
|
||||
await self.js.add_stream(
|
||||
name=self.stream_name,
|
||||
subjects=[f"{self.stream_name}.*"],
|
||||
subjects=[f"{self.stream_name}.>"],
|
||||
)
|
||||
logger.info("Created JetStream stream", stream=self.stream_name)
|
||||
|
||||
@@ -214,12 +258,17 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
while self.running:
|
||||
try:
|
||||
# Fetch messages in batches
|
||||
messages = await subscription.fetch(batch=10, timeout=20)
|
||||
messages = await subscription.fetch(batch=10, timeout=5)
|
||||
|
||||
for message in messages:
|
||||
start_time = time.perf_counter()
|
||||
payload = None
|
||||
|
||||
try:
|
||||
print(f"DEBUG: Received message: {message.data}")
|
||||
# Parse message payload
|
||||
payload_dict = json.loads(message.data.decode())
|
||||
print(f"DEBUG: Parsed payload: {payload_dict}")
|
||||
|
||||
payload = EventPayload(
|
||||
data=payload_dict["data"],
|
||||
@@ -230,38 +279,87 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
|
||||
)
|
||||
payload.event_id = payload_dict["event_id"]
|
||||
payload.occurred_at = payload_dict["occurred_at"]
|
||||
print(f"DEBUG: Reconstructed payload: {payload.event_id}")
|
||||
|
||||
# Call all handlers for this topic
|
||||
for handler in self.handlers.get(topic, []):
|
||||
try:
|
||||
await handler(topic, payload)
|
||||
except (
|
||||
Exception
|
||||
) as e: # pylint: disable=broad-exception-caught
|
||||
logger.error(
|
||||
"Handler failed",
|
||||
topic=topic,
|
||||
event_id=payload.event_id,
|
||||
error=str(e),
|
||||
)
|
||||
print(f"DEBUG: Calling handler for topic {topic}")
|
||||
await handler(topic, payload)
|
||||
|
||||
# Acknowledge message
|
||||
await message.ack()
|
||||
print("DEBUG: Message acked")
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
logger.error(
|
||||
"Failed to decode message", topic=topic, error=str(e)
|
||||
# Record metrics
|
||||
duration = time.perf_counter() - start_time
|
||||
EventMetricsCollector.record_consume(
|
||||
topic=topic,
|
||||
consumer_group=self.consumer_group,
|
||||
duration_seconds=duration,
|
||||
success=True,
|
||||
)
|
||||
await message.nak()
|
||||
|
||||
except Exception as e: # pylint: disable=broad-exception-caught
|
||||
logger.error(
|
||||
"Failed to process message", topic=topic, error=str(e)
|
||||
duration = time.perf_counter() - start_time
|
||||
error_type = type(e).__name__
|
||||
|
||||
# Record failure metric
|
||||
EventMetricsCollector.record_consume(
|
||||
topic=topic,
|
||||
consumer_group=self.consumer_group,
|
||||
duration_seconds=duration,
|
||||
success=False,
|
||||
error_type=error_type,
|
||||
)
|
||||
await message.nak()
|
||||
|
||||
# Check delivery count for DLQ
|
||||
try:
|
||||
metadata = message.metadata
|
||||
num_delivered = (
|
||||
metadata.sequence.consumer
|
||||
) # This might be wrong, check docs
|
||||
# Actually nats-py MsgMetadata has num_delivered
|
||||
num_delivered = metadata.num_delivered
|
||||
except Exception:
|
||||
num_delivered = 1
|
||||
|
||||
if num_delivered >= self.max_retries:
|
||||
logger.error(
|
||||
"Max retries exceeded, sending to DLQ",
|
||||
topic=topic,
|
||||
event_id=payload.event_id if payload else "unknown",
|
||||
error=str(e),
|
||||
num_delivered=num_delivered,
|
||||
)
|
||||
|
||||
if self.dlq and payload:
|
||||
await self.dlq.send_to_dlq(
|
||||
topic=topic,
|
||||
payload=payload,
|
||||
error=e,
|
||||
retry_count=num_delivered,
|
||||
original_message_data=message.data,
|
||||
)
|
||||
EventMetricsCollector.record_dlq(topic, error_type)
|
||||
|
||||
# Ack to remove from main stream
|
||||
await message.ack()
|
||||
|
||||
else:
|
||||
# Retry (Nak)
|
||||
logger.warning(
|
||||
"Processing failed, retrying",
|
||||
topic=topic,
|
||||
event_id=payload.event_id if payload else "unknown",
|
||||
error=str(e),
|
||||
attempt=num_delivered,
|
||||
)
|
||||
EventMetricsCollector.record_retry(topic, num_delivered)
|
||||
await message.nak()
|
||||
|
||||
except TimeoutError:
|
||||
# No messages available, continue polling
|
||||
continue
|
||||
except Exception as e: # pylint: disable=broad-exception-caught
|
||||
logger.error("Consumer error", topic=topic, error=str(e))
|
||||
await asyncio.sleep(5) # Wait before retrying
|
||||
await asyncio.sleep(1) # Wait before retrying
|
||||
|
||||
Reference in New Issue
Block a user