completed local setup with compose
Some checks failed: every CI/CD Pipeline job for this push was cancelled, including Generate SBOM, Deploy to Staging, Deploy to Production, Code Quality & Linting, Policy Validation, Test Suite, Build Docker Images (svc-coverage, svc-extract, svc-firm-connectors, svc-forms, svc-hmrc, svc-ingestion, svc-kg, svc-normalize-map, svc-ocr, svc-rag-indexer, svc-rag-retriever, svc-reason, svc-rpa, ui-review), Security Scanning (svc-coverage, svc-extract, svc-kg, svc-rag-retriever, ui-review), and Notifications.

harkon
2025-11-26 13:17:17 +00:00
parent 8fe5e62fee
commit fdba81809f
87 changed files with 5610 additions and 3376 deletions


@@ -2,6 +2,7 @@
import asyncio
import json
import time
from collections.abc import Awaitable, Callable
from typing import Any
@@ -12,6 +13,8 @@ from nats.js import JetStreamContext
from nats.js.api import AckPolicy, ConsumerConfig, DeliverPolicy
from .base import EventBus, EventPayload
from .dlq import DLQHandler
from .metrics import EventMetricsCollector
logger = structlog.get_logger()
@@ -24,6 +27,8 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
servers: str | list[str] = "nats://localhost:4222",
stream_name: str = "TAX_AGENT_EVENTS",
consumer_group: str = "tax-agent",
dlq_stream_name: str = "TAX_AGENT_DLQ",
max_retries: int = 3,
):
if isinstance(servers, str):
self.servers = [servers]
@@ -32,8 +37,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
self.stream_name = stream_name
self.consumer_group = consumer_group
self.dlq_stream_name = dlq_stream_name
self.max_retries = max_retries
self.nc: NATS | None = None
self.js: JetStreamContext | None = None
self.dlq: DLQHandler | None = None
self.handlers: dict[
str, list[Callable[[str, EventPayload], Awaitable[None]]]
] = {}
@@ -48,19 +58,32 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
try:
# Connect to NATS
-self.nc = await nats.connect(servers=self.servers)
+self.nc = await nats.connect(
+servers=self.servers,
+connect_timeout=10,
+reconnect_time_wait=1,
+)
# Get JetStream context
-self.js = self.nc.jetstream()
+self.js = self.nc.jetstream(timeout=10)
-# Ensure stream exists
+# Initialize DLQ handler
+self.dlq = DLQHandler(
+js=self.js,
+dlq_stream_name=self.dlq_stream_name,
+max_retries=self.max_retries,
+)
+# Ensure streams exist
await self._ensure_stream_exists()
+await self.dlq.ensure_dlq_stream_exists()
self.running = True
logger.info(
"NATS event bus started",
servers=self.servers,
stream=self.stream_name,
dlq_stream=self.dlq_stream_name,
)
except Exception as e:
@@ -98,6 +121,7 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
if not self.js:
raise RuntimeError("Event bus not started")
start_time = time.perf_counter()
try:
# Create subject name from topic
subject = f"{self.stream_name}.{topic}"
@@ -117,6 +141,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
headers=headers,
)
duration = time.perf_counter() - start_time
EventMetricsCollector.record_publish(
topic=topic,
duration_seconds=duration,
success=True,
)
logger.info(
"Event published",
topic=topic,
@@ -127,6 +158,14 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
return True
except Exception as e: # pylint: disable=broad-exception-caught
duration = time.perf_counter() - start_time
EventMetricsCollector.record_publish(
topic=topic,
duration_seconds=duration,
success=False,
error_type=type(e).__name__,
)
logger.error(
"Failed to publish event",
topic=topic,
@@ -152,9 +191,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
subject = f"{self.stream_name}.{topic}"
# Create durable consumer
consumer_name = f"{self.consumer_group}-{topic}"
# Durable names cannot contain dots, so we replace them
safe_topic = topic.replace(".", "-")
consumer_name = f"{self.consumer_group}-{safe_topic}"
# Subscribe with pull-based consumer
# Set max_deliver to max_retries + 2: the initial delivery, the retries,
# and extra room so we can hand the message to the DLQ before NATS gives up
subscription = await self.js.pull_subscribe(
subject=subject,
durable=consumer_name,
@@ -162,7 +205,7 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
durable_name=consumer_name,
ack_policy=AckPolicy.EXPLICIT,
deliver_policy=DeliverPolicy.NEW,
-max_deliver=3,
+max_deliver=self.max_retries + 2,  # Give us room to handle DLQ
ack_wait=30, # 30 seconds
),
)
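The max_deliver choice here interacts with the DLQ logic further down in the consume loop. A short worked example of the delivery budget, using the constructor default max_retries=3 taken from this diff (everything else below is illustrative only):

# Delivery budget with the default max_retries=3 (illustrative sketch).
max_retries = 3
max_deliver = max_retries + 2   # 5 deliveries allowed by JetStream in total
# The consume loop dead-letters as soon as num_delivered >= max_retries,
# i.e. on the 3rd delivery; the remaining deliveries are headroom in case
# the manual DLQ hand-off or the ack itself fails and the message comes back.
assert max_deliver == 5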
@@ -193,13 +236,14 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
# Try to get stream info
await self.js.stream_info(self.stream_name)
logger.debug("Stream already exists", stream=self.stream_name)
EventMetricsCollector.record_nats_stream_message(self.stream_name)
except Exception:
# Stream doesn't exist, create it
try:
await self.js.add_stream(
name=self.stream_name,
subjects=[f"{self.stream_name}.*"],
subjects=[f"{self.stream_name}.>"],
)
logger.info("Created JetStream stream", stream=self.stream_name)
@@ -214,12 +258,17 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
while self.running:
try:
# Fetch messages in batches
-messages = await subscription.fetch(batch=10, timeout=20)
+messages = await subscription.fetch(batch=10, timeout=5)
for message in messages:
start_time = time.perf_counter()
payload = None
try:
print(f"DEBUG: Received message: {message.data}")
# Parse message payload
payload_dict = json.loads(message.data.decode())
print(f"DEBUG: Parsed payload: {payload_dict}")
payload = EventPayload(
data=payload_dict["data"],
@@ -230,38 +279,87 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
)
payload.event_id = payload_dict["event_id"]
payload.occurred_at = payload_dict["occurred_at"]
print(f"DEBUG: Reconstructed payload: {payload.event_id}")
# Call all handlers for this topic
for handler in self.handlers.get(topic, []):
-try:
-await handler(topic, payload)
-except (
-Exception
-) as e:  # pylint: disable=broad-exception-caught
-logger.error(
-"Handler failed",
-topic=topic,
-event_id=payload.event_id,
-error=str(e),
-)
+print(f"DEBUG: Calling handler for topic {topic}")
+await handler(topic, payload)
# Acknowledge message
await message.ack()
+print("DEBUG: Message acked")
-except json.JSONDecodeError as e:
-logger.error("Failed to decode message", topic=topic, error=str(e))
-await message.nak()
+# Record metrics
+duration = time.perf_counter() - start_time
+EventMetricsCollector.record_consume(
+topic=topic,
+consumer_group=self.consumer_group,
+duration_seconds=duration,
+success=True,
+)
except Exception as e:  # pylint: disable=broad-exception-caught
-logger.error("Failed to process message", topic=topic, error=str(e))
-await message.nak()
+duration = time.perf_counter() - start_time
+error_type = type(e).__name__
+# Record failure metric
+EventMetricsCollector.record_consume(
+topic=topic,
+consumer_group=self.consumer_group,
+duration_seconds=duration,
+success=False,
+error_type=error_type,
+)
# Check delivery count for DLQ
try:
metadata = message.metadata
# nats-py MsgMetadata exposes the delivery count directly
num_delivered = metadata.num_delivered
except Exception:
num_delivered = 1
if num_delivered >= self.max_retries:
logger.error(
"Max retries exceeded, sending to DLQ",
topic=topic,
event_id=payload.event_id if payload else "unknown",
error=str(e),
num_delivered=num_delivered,
)
if self.dlq and payload:
await self.dlq.send_to_dlq(
topic=topic,
payload=payload,
error=e,
retry_count=num_delivered,
original_message_data=message.data,
)
EventMetricsCollector.record_dlq(topic, error_type)
# Ack to remove from main stream
await message.ack()
else:
# Retry (Nak)
logger.warning(
"Processing failed, retrying",
topic=topic,
event_id=payload.event_id if payload else "unknown",
error=str(e),
attempt=num_delivered,
)
EventMetricsCollector.record_retry(topic, num_delivered)
await message.nak()
except TimeoutError:
# No messages available, continue polling
continue
except Exception as e: # pylint: disable=broad-exception-caught
logger.error("Consumer error", topic=topic, error=str(e))
-await asyncio.sleep(5)  # Wait before retrying
+await asyncio.sleep(1)  # Wait before retrying
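Taken together, the changes can be exercised with a small local smoke test along the lines of the sketch below. This is only a sketch: it assumes a NATS server with JetStream enabled is reachable on nats://localhost:4222 (for example from the compose setup this commit refers to), and it assumes the bus exposes async start(), stop(), subscribe(topic, handler) and publish(topic, payload) coroutines and that EventPayload accepts a data keyword; the diff only shows fragments of those definitions, and the import paths below are guesses.

# Hypothetical DLQ smoke test; import paths, method names and the
# EventPayload constructor arguments are assumptions, not confirmed by this diff.
import asyncio

from events.base import EventPayload       # assumed module path
from events.nats_bus import NATSEventBus   # assumed module path


async def always_fail(topic: str, payload: EventPayload) -> None:
    # Raise on every delivery so the message is retried and finally
    # routed to the TAX_AGENT_DLQ stream once max_retries is reached.
    raise RuntimeError("simulated handler failure")


async def main() -> None:
    bus = NATSEventBus(servers="nats://localhost:4222", max_retries=2)
    await bus.start()
    await bus.subscribe("documents.received", always_fail)                       # assumed API
    await bus.publish("documents.received", EventPayload(data={"doc_id": "42"}))  # assumed API
    await asyncio.sleep(15)  # leave time for the retries and the DLQ hand-off (timing is approximate)
    await bus.stop()                                                              # assumed API


if __name__ == "__main__":
    asyncio.run(main())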