completed local setup with compose
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

harkon
2025-11-26 13:17:17 +00:00
parent 8fe5e62fee
commit fdba81809f
87 changed files with 5610 additions and 3376 deletions

.gitignore vendored

@@ -99,6 +99,7 @@ target/
# IPython
profile_default/
ipython_config.py
.env.*
# pyenv
# For a library or package, you might want to ignore these files since the code is

GEMINI.md Normal file

@@ -15,10 +15,7 @@ help: ## Show this help message
# Environment setup
bootstrap: ## Bootstrap the development environment
@echo "🚀 Bootstrapping AI Tax Agent System..."
@if [ ! -f infra/compose/.env ]; then \
cp infra/compose/env.example infra/compose/.env; \
echo "📝 Created .env file from template"; \
fi
@./scripts/generate-secrets.sh
@mkdir -p data/{postgres,neo4j,qdrant,minio,vault,redis,prometheus,grafana,loki,authentik}
@mkdir -p logs/{services,infra}
@mkdir -p certs
@@ -32,6 +29,7 @@ networks: ## Create external Docker networks
generate-secrets: ## Generate secure secrets for deployment
@./scripts/generate-secrets.sh
@ln -sf ../environments/local/.env infra/compose/.env
setup-authentik: ## Configure Authentik SSO after deployment
@./scripts/setup-authentik.sh
@@ -39,19 +37,22 @@ setup-authentik: ## Configure Authentik SSO after deployment
complete-authentik-setup: ## Complete Authentik initial setup and get API token
@./scripts/complete-authentik-setup.sh
auto-setup-authentik: ## Automatically complete Authentik initial setup
@./scripts/auto-setup-authentik.sh
setup-sso: ## Complete end-to-end SSO setup (setup + configuration)
@echo "🔐 Setting up complete SSO configuration..."
@echo "Step 1: Attempting automatic initial setup..."
@./scripts/auto-setup-authentik.sh || true
@echo "Step 2: Getting API token..."
@echo "Step 1: Completing Authentik initial setup..."
@./scripts/complete-authentik-setup.sh || true
@echo "Step 3: Importing blueprint configuration..."
@./scripts/setup-authentik.sh
@echo "Step 4: Configuring Vault OIDC..."
@./scripts/setup-vault.sh
@echo "🎉 SSO setup complete!"
setup-vault: ## Configure Vault OIDC
@./scripts/setup-vault.sh
fix-databases: ## Fix common database issues
@echo "🔧 Fixing database issues..."
@./scripts/fix-database-issues.sh
@@ -62,40 +63,40 @@ deploy-with-fixes: ## Deploy with all discovered fixes applied
networks-clean: ## Remove external Docker networks
@echo "🧹 Removing external Docker networks..."
@docker network rm ai-tax-agent-frontend 2>/dev/null || true
@docker network rm ai-tax-agent-backend 2>/dev/null || true
@docker network rm apa-frontend 2>/dev/null || true
@docker network rm apa-backend 2>/dev/null || true
@echo "✅ Networks removed"
# Development lifecycle
run: ## Start all services in development mode
@echo "🏃 Starting AI Tax Agent System..."
@./scripts/deploy.sh
@./infra/scripts/deploy.sh local all
run-simple: ## Start all services without fixes (original behavior)
@echo "🏃 Starting AI Tax Agent System (simple)..."
@./scripts/create-networks.sh
@./scripts/generate-dev-certs.sh
@cd infra/compose && docker compose -f docker-compose.local.yml up -d
@cd infra/compose && docker compose up -d
@echo "⏳ Waiting for services to be ready..."
@sleep 10
@make status
@echo "🔧 Run 'make setup-authentik' to configure SSO"
@echo "🔧 Run 'make setup-sso' to configure SSO"
setup: generate-secrets deploy-infra ## Complete setup with secrets and infrastructure
@echo "🎉 Setup complete! Next steps:"
@echo " 1. Run 'make setup-authentik' to configure SSO"
@echo " 1. Run 'make setup-sso' to configure SSO"
@echo " 2. Run 'make deploy-services' to start application services"
@echo " 3. Access Authentik at https://auth.local"
@echo " 3. Access Authentik at https://auth.local.lan"
@echo ""
@echo "🎉 System is running!"
@echo "📊 Grafana: https://grafana.local"
@echo "🔐 Authentik: https://auth.local"
@echo "📝 Review UI: https://review.local"
@echo "📊 Grafana: https://grafana.local.lan"
@echo "🔐 Authentik: https://auth.local.lan"
@echo "📝 Review UI: https://review.local.lan"
@echo "🔧 Traefik Dashboard: http://localhost:8080"
stop: ## Stop all services
@echo "🛑 Stopping AI Tax Agent System..."
@cd infra/compose && docker compose -f docker-compose.local.yml down
@cd infra/compose && docker compose down
restart: ## Restart all services
@echo "🔄 Restarting AI Tax Agent System..."
@@ -105,30 +106,30 @@ restart: ## Restart all services
# Build and deployment
build: ## Build all Docker images
@echo "🔨 Building Docker images..."
@cd infra/compose && docker compose -f docker-compose.local.yml build --parallel
@cd infra/compose && docker compose build --parallel
@echo "✅ Build complete"
build-service: ## Build specific service (usage: make build-service SERVICE=svc-ingestion)
@echo "🔨 Building $(SERVICE)..."
@cd infra/compose && docker compose -f docker-compose.local.yml build $(SERVICE)
@cd infra/compose && docker compose build $(SERVICE)
@echo "✅ Build complete for $(SERVICE)"
deploy-infra: networks ## Deploy only infrastructure services
@echo "🏗️ Deploying infrastructure services..."
@./scripts/generate-dev-certs.sh
@cd infra/compose && docker compose -f docker-compose.local.yml up -d ata-traefik ata-postgres ata-redis ata-authentik-db ata-authentik-redis
@cd infra/compose && docker compose up -d apa-traefik apa-postgres apa-redis apa-authentik-db apa-authentik-redis
@echo "⏳ Waiting for databases..."
@sleep 15
@make fix-databases
@cd infra/compose && docker compose -f docker-compose.local.yml up -d ata-authentik-server ata-authentik-worker ata-authentik-outpost ata-vault ata-neo4j ata-qdrant ata-minio ata-prometheus ata-grafana ata-loki
@cd infra/compose && docker compose up -d apa-authentik-server apa-authentik-worker apa-authentik-outpost apa-vault apa-neo4j apa-qdrant apa-minio apa-prometheus apa-grafana apa-loki
@echo "✅ Infrastructure deployment complete"
@echo "⏳ Waiting for services to be ready..."
@sleep 30
@echo "🔧 Run 'make setup-authentik' to configure SSO"
@echo "🔧 Run 'make setup-sso' to configure SSO"
deploy-services: ## Deploy only application services
@echo "🚀 Deploying application services..."
@cd infra/compose && docker compose -f docker-compose.local.yml up -d ata-svc-ingestion ata-svc-extract ata-svc-forms ata-svc-hmrc ata-svc-kg ata-svc-normalize-map ata-svc-ocr ata-svc-rag-indexer ata-svc-rag-retriever ata-svc-reason ata-svc-rpa ata-svc-firm-connectors ata-ui-review ata-unleash
@cd infra/compose && docker compose up -d apa-svc-ingestion apa-svc-extract apa-svc-forms apa-svc-hmrc apa-svc-kg apa-svc-normalize-map apa-svc-ocr apa-svc-rag-indexer apa-svc-rag-retriever apa-svc-reason apa-svc-rpa apa-svc-firm-connectors
@echo "✅ Services deployment complete"
# Development tools
@@ -236,7 +237,7 @@ deploy-monitoring-prod: ## Deploy monitoring stack (production)
seed: ## Seed the system with initial data
@echo "🌱 Seeding system with initial data..."
@echo "📊 Creating Neo4j constraints and indexes..."
@docker exec ata-neo4j cypher-shell -u neo4j -p $(NEO4J_PASSWORD) -f /var/lib/neo4j/import/schema.cypher 2>/dev/null || echo "Neo4j not ready"
@docker exec apa-neo4j cypher-shell -u neo4j -p $(NEO4J_PASSWORD) -f /var/lib/neo4j/import/schema.cypher 2>/dev/null || echo "Neo4j not ready"
@echo "🗂️ Creating Qdrant collections..."
@curl -X PUT "http://localhost:6333/collections/documents" -H "Content-Type: application/json" -d '{"vectors": {"size": 1536, "distance": "Cosine"}}' 2>/dev/null || echo "Qdrant not ready"
@echo "✅ Seeding complete"
@@ -247,7 +248,7 @@ seed-test-data: ## Load test data for development
# Monitoring and debugging
logs: ## Show logs from all services
@cd infra/compose && docker compose -f docker-compose.local.yml logs -f
@cd infra/compose && docker compose logs -f
logs-service: ## Show logs from specific service (usage: make logs-service SERVICE=svc-extract)
@@ -255,22 +256,22 @@ logs-service: ## Show logs from specific service (usage: make logs-service SERVICE=svc-extract)
echo "❌ Please specify SERVICE (e.g., make logs-service SERVICE=svc-extract)"; \
exit 1; \
fi
@cd infra/compose && docker compose -f docker-compose.local.yml logs -f $(SERVICE)
@cd infra/compose && docker compose logs -f $(SERVICE)
status: ## Show status of all services
@echo "📊 Service Status:"
@cd infra/compose && docker compose -f docker-compose.local.yml ps
@cd infra/compose && docker compose ps
health: ## Check health of all services
@echo "🏥 Health Check:"
@echo "🔗 Traefik: $$(curl -s -o /dev/null -w '%{http_code}' http://localhost:8080/ping || echo 'DOWN')"
@echo "🗄️ PostgreSQL: $$(docker exec ata-postgres pg_isready -U postgres 2>/dev/null && echo 'UP' || echo 'DOWN')"
@echo "🗄️ PostgreSQL: $$(docker exec apa-postgres pg_isready -U postgres 2>/dev/null && echo 'UP' || echo 'DOWN')"
@echo "📊 Neo4j: $$(curl -s -o /dev/null -w '%{http_code}' http://localhost:7474 || echo 'DOWN')"
@echo "🔍 Qdrant: $$(curl -s -o /dev/null -w '%{http_code}' http://localhost:6333/health || echo 'DOWN')"
@echo "📦 MinIO: $$(curl -s -o /dev/null -w '%{http_code}' http://localhost:9000/minio/health/live || echo 'DOWN')"
@echo "🔐 Vault: $$(curl -s -o /dev/null -w '%{http_code}' http://localhost:8200/v1/sys/health || echo 'DOWN')"
@echo "🏃 Redis: $$(docker exec ata-redis redis-cli ping 2>/dev/null || echo 'DOWN')"
@echo "🔐 Authentik: $$(curl -s -k -o /dev/null -w '%{http_code}' https://auth.local || echo 'DOWN')"
@echo "🏃 Redis: $$(docker exec apa-redis redis-cli ping 2>/dev/null || echo 'DOWN')"
@echo "🔐 Authentik: $$(curl -s -k -o /dev/null -w '%{http_code}' https://auth.local.lan || echo 'DOWN')"
verify: ## Run comprehensive infrastructure verification
@echo "🔍 Running infrastructure verification..."
@@ -282,24 +283,24 @@ troubleshoot: ## Run comprehensive troubleshooting and fixes
restart-authentik: ## Restart Authentik components in correct order
@echo "🔄 Restarting Authentik components..."
@cd infra/compose && docker compose -f docker-compose.local.yml stop ata-authentik-server ata-authentik-worker ata-authentik-outpost
@cd infra/compose && docker compose stop apa-authentik-server apa-authentik-worker apa-authentik-outpost
@make fix-databases
@cd infra/compose && docker compose -f docker-compose.local.yml up -d ata-authentik-server
@cd infra/compose && docker compose up -d apa-authentik-server
@sleep 15
@cd infra/compose && docker compose -f docker-compose.local.yml up -d ata-authentik-worker ata-authentik-outpost
@cd infra/compose && docker compose up -d apa-authentik-worker apa-authentik-outpost
@echo "✅ Authentik restart complete"
restart-unleash: ## Restart Unleash with database fixes
@echo "🔄 Restarting Unleash..."
@cd infra/compose && docker compose -f docker-compose.local.yml stop ata-unleash
@cd infra/compose && docker compose stop apa-unleash
@make fix-databases
@cd infra/compose && docker compose -f docker-compose.local.yml up -d ata-unleash
@cd infra/compose && docker compose up -d apa-unleash
@echo "✅ Unleash restart complete"
# Cleanup
clean: ## Clean up containers, volumes, and networks
@echo "🧹 Cleaning up..."
@cd infra/compose && docker compose -f docker-compose.local.yml down -v --remove-orphans
@cd infra/compose && docker compose down -v --remove-orphans
@docker system prune -f
@echo "✅ Cleanup complete"
@@ -320,13 +321,13 @@ shell: ## Open shell in specific service (usage: make shell SERVICE=svc-extract)
@docker exec -it $(SERVICE) /bin/bash
db-shell: ## Open PostgreSQL shell
@docker exec -it ata-postgres psql -U postgres -d tax_system
@docker exec -it apa-postgres psql -U postgres -d tax_system
neo4j-shell: ## Open Neo4j shell
@docker exec -it ata-neo4j cypher-shell -u neo4j -p $(NEO4J_PASSWORD)
@docker exec -it apa-neo4j cypher-shell -u neo4j -p $(NEO4J_PASSWORD)
redis-shell: ## Open Redis shell
@docker exec -it ata-redis redis-cli
@docker exec -it apa-redis redis-cli
# Documentation
docs: ## Generate documentation
@@ -361,9 +362,9 @@ load-test: ## Run load tests
backup: ## Create backup of all data
@echo "💾 Creating backup..."
@mkdir -p backups/$$(date +%Y%m%d_%H%M%S)
@docker exec ata-postgres pg_dump -U postgres tax_system > backups/$$(date +%Y%m%d_%H%M%S)/postgres.sql
@docker exec ata-neo4j neo4j-admin dump --database=neo4j --to=/tmp/neo4j.dump
@docker cp ata-neo4j:/tmp/neo4j.dump backups/$$(date +%Y%m%d_%H%M%S)/
@docker exec apa-postgres pg_dump -U postgres tax_system > backups/$$(date +%Y%m%d_%H%M%S)/postgres.sql
@docker exec apa-neo4j neo4j-admin dump --database=neo4j --to=/tmp/neo4j.dump
@docker cp apa-neo4j:/tmp/neo4j.dump backups/$$(date +%Y%m%d_%H%M%S)/
@echo "✅ Backup created in backups/ directory"
restore: ## Restore from backup (usage: make restore BACKUP=20240101_120000)
@@ -374,9 +375,9 @@ restore: ## Restore from backup (usage: make restore BACKUP=20240101_120000)
@echo "📥 Restoring from backup $(BACKUP)..."
@echo "⚠️ This will overwrite existing data!"
@read -p "Are you sure? (y/N): " confirm && [ "$$confirm" = "y" ] || exit 1
@docker exec -i ata-postgres psql -U postgres -d tax_system < backups/$(BACKUP)/postgres.sql
@docker cp backups/$(BACKUP)/neo4j.dump ata-neo4j:/tmp/
@docker exec ata-neo4j neo4j-admin load --database=neo4j --from=/tmp/neo4j.dump --force
@docker exec -i apa-postgres psql -U postgres -d tax_system < backups/$(BACKUP)/postgres.sql
@docker cp backups/$(BACKUP)/neo4j.dump apa-neo4j:/tmp/
@docker exec apa-neo4j neo4j-admin load --database=neo4j --from=/tmp/neo4j.dump --force
@echo "✅ Restore complete"
# Environment variables


@@ -188,8 +188,7 @@ ai-tax-agent-2/
│ └── svc-firm-connectors/ # Firm integration service
├── infra/ # Infrastructure
│ ├── compose/ # Docker Compose files
│ ├── k8s/ # Kubernetes manifests
│ └── terraform/ # Terraform configurations
│ └── k8s/ # Kubernetes manifests
├── tests/ # Test suites
│ ├── e2e/ # End-to-end tests
│ └── unit/ # Unit tests

SETUP.md Normal file

@@ -0,0 +1,66 @@
# AI Tax Agent - Setup Guide
This guide describes how to set up the AI Tax Agent infrastructure from scratch.
## Prerequisites
- Docker Desktop (latest version)
- Make
- Python 3.11+
- **Host Networking**: Add the following to your `/etc/hosts` file:
```text
127.0.0.1 local.lan traefik.local.lan auth.local.lan api.local.lan minio.local.lan vault.local.lan grafana.local.lan
```
## Quick Start (Fresh Install)
To start the entire system from a clean slate:
1. **Clean up existing resources** (WARNING: This deletes all data):
```bash
make clean-data
```
2. **Bootstrap the environment**:
This generates secure secrets and creates necessary directories.
```bash
make bootstrap
```
3. **Deploy Infrastructure**:
This starts all core services (Databases, Authentik, Vault, MinIO, etc.).
```bash
make deploy-infra
```
_Wait for about 30-60 seconds for services to initialize._
4. **Deploy Application Services**:
This starts the AI Tax Agent microservices.
```bash
make deploy-services
```
## Verification
Once everything is up, you can access the following services:
- **Authentik (SSO)**: [https://auth.local.lan](https://auth.local.lan)
- Username: `admin@local.lan`
- Password: see `infra/environments/local/.env` (look for `AUTHENTIK_BOOTSTRAP_PASSWORD`; the default is `admin123`)
- **Traefik Dashboard**: [https://traefik.local.lan/dashboard/](https://traefik.local.lan/dashboard/)
- **Grafana**: [https://grafana.local.lan](https://grafana.local.lan)
- **MinIO Console**: [https://minio.local.lan](https://minio.local.lan)
- **Vault**: [https://vault.local.lan](https://vault.local.lan)
- **API Health**: [https://api.local.lan/ingestion/health](https://api.local.lan/ingestion/health)
## Troubleshooting
If services fail to start or connect:
- Check logs: `make logs`
- Check status: `make status`
- Restart Authentik (if SSO issues): `make restart-authentik`
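As a quick cross-check of the verification URLs above, here is a minimal Python sketch (standard library only) that polls a few health endpoints. The Authentik and Grafana health paths are assumptions rather than endpoints documented in this repository, and TLS verification is disabled only because the local stack uses self-signed development certificates:

```python
import ssl
import urllib.request

# Endpoints taken from the Verification list above; the Authentik and Grafana
# health paths are assumptions, not confirmed by this repository.
ENDPOINTS = {
    "API (ingestion)": "https://api.local.lan/ingestion/health",
    "Authentik": "https://auth.local.lan/-/health/ready/",
    "Grafana": "https://grafana.local.lan/api/health",
}

# The local dev certs are self-signed, so skip certificate verification
# for this check only.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

for name, url in ENDPOINTS.items():
    try:
        with urllib.request.urlopen(url, context=ctx, timeout=5) as resp:
            print(f"{name}: HTTP {resp.status}")
    except Exception as exc:
        print(f"{name}: DOWN ({exc})")
```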


@@ -13,9 +13,10 @@ ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY libs/requirements-ml.txt /tmp/libs-ml-requirements.txt
COPY apps/svc_extract/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-ml-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim


@@ -43,7 +43,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port


@@ -44,7 +44,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port


@@ -158,13 +158,13 @@ async def upload_document(
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"filename": file.filename or "unknown",
"kind": kind.value,
"source": source,
"checksum": checksum,
"file_size": len(content),
"content_type": content_type,
"s3_url": storage_result["s3_url"],
"checksum_sha256": checksum,
"size_bytes": len(content),
"mime_type": content_type,
"storage_path": storage_result["s3_url"],
},
actor=current_user.get("sub", "system"),
tenant_id=tenant_id,
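For reference, a sketch of the upload event's `data` dict after this rename; every value is invented, and the key names simply mirror the code above:

```python
# Illustrative only: the keys follow the renamed fields in svc_ingestion's
# upload_document; every value below is made up.
data = {
    "doc_id": "01HZEXAMPLEULID",       # hypothetical ULID
    "tenant_id": "tenant-001",
    "filename": "invoice.pdf",
    "kind": "invoice",
    "source": "manual_upload",
    "checksum_sha256": "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08",
    "size_bytes": 48213,
    "mime_type": "application/pdf",
    "storage_path": "s3://tenant-001/raw/01HZEXAMPLEULID.pdf",
}
```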


@@ -1,54 +1,27 @@
# Multi-stage build for svc_kg
FROM python:3.12-slim AS builder
FROM python:3.12-slim-bookworm
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Set environment variables
ENV PYTHONUNBUFFERED 1
ENV APP_HOME /app
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Create and set working directory
WORKDIR $APP_HOME
# Copy requirements and install dependencies
# Install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY libs/requirements-rdf.txt /tmp/libs-rdf.txt
COPY apps/svc_kg/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-rdf.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_kg/ ./apps/svc_kg/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_kg.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -1,28 +1,22 @@
# FILE: apps/svc-kg/main.py
# Knowledge graph facade with CRUD, queries, lineage, and SHACL validation
import json
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
from typing import Any, cast
import structlog
from fastapi import Depends, HTTPException, Query, Request
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse
from pyshacl import validate
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDF
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.events import EventBus
from libs.neo import Neo4jClient, SHACLValidator, TemporalQueries
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
@@ -31,523 +25,193 @@ class KGSettings(BaseAppSettings):
"""Settings for KG service"""
service_name: str = "svc-kg"
shacl_shapes_path: str = "schemas/shapes.ttl"
# SHACL validation
shapes_file: str = "schemas/shapes.ttl"
validate_on_write: bool = True
# Query limits
max_results: int = 1000
max_depth: int = 10
query_timeout: int = 30
# Create app and settings
app, settings = create_app(
service_name="svc-kg",
title="Tax Agent Knowledge Graph Service",
description="Knowledge graph facade with CRUD and queries",
settings_class=KGSettings,
)
# Global clients
neo4j_client: Neo4jClient | None = None
shacl_validator: SHACLValidator | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-kg")
metrics = get_metrics()
shapes_graph: Graph | None = None
settings: KGSettings
@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: KGSettings) -> None:
"""Initialize service dependencies"""
global neo4j_client, shacl_validator, event_bus
global neo4j_client, event_bus, settings, shapes_graph
settings = app_settings
logger.info("Starting KG service")
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize SHACL validator
if os.path.exists(settings.shapes_file):
shacl_validator = SHACLValidator(settings.shapes_file)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
logger.info("KG service started successfully")
await event_bus.subscribe(EventTopics.KG_UPSERT_READY, _handle_kg_upsert_ready)
# Load SHACL shapes
try:
shapes_graph = Graph().parse(settings.shacl_shapes_path, format="turtle")
logger.info("SHACL shapes loaded successfully")
except Exception as e:
logger.error("Failed to load SHACL shapes", error=str(e))
shapes_graph = None
app, _settings = create_app(
service_name="svc-kg",
title="Tax Agent Knowledge Graph Service",
description="Service for managing and validating the Knowledge Graph",
settings_class=KGSettings,
)
# Initialize dependencies immediately
@app.on_event("startup")
async def startup_event():
await init_dependencies(cast(KGSettings, _settings))
tracer = get_tracer("svc-kg")
metrics = get_metrics()
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
global event_bus, neo4j_client
logger.info("Shutting down KG service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
if neo4j_client:
await neo4j_client.close()
logger.info("KG service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
async def _handle_kg_upsert_ready(topic: str, payload: EventPayload) -> None:
"""Handle KG upsert ready events"""
data = payload.data
nodes = data.get("nodes", [])
relationships = data.get("relationships", [])
document_id = data.get("document_id")
tenant_id = data.get("tenant_id")
if not nodes and not relationships:
logger.warning("No nodes or relationships to upsert", data=data)
return
@app.post("/nodes/{label}")
async def create_node(
label: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create a new node"""
with tracer.start_as_current_span("create_node") as span:
span.set_attribute("label", label)
with tracer.start_as_current_span("upsert_kg_data") as span:
span.set_attribute("document_id", document_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("node_count", len(nodes))
span.set_attribute("relationship_count", len(relationships))
try:
# Add tenant isolation
properties["tenant_id"] = tenant_id
properties["created_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Create node
result = await neo4j_client.create_node(label, properties)
# Update metrics
metrics.counter("nodes_created_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node created", label=label, node_id=result.get("id"))
return {
"status": "created",
"label": label,
"properties": properties,
"neo4j_result": result,
}
except Exception as e:
logger.error("Failed to create node", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create node: {str(e)}"
# 1. Validate data against SHACL schema
conforms, validation_report = await _validate_with_shacl(
nodes, relationships
)
if not conforms:
logger.error(
"SHACL validation failed",
document_id=document_id,
validation_report=validation_report,
)
metrics.counter("kg_validation_errors_total").labels(
tenant_id=tenant_id
).inc()
return
# 2. Write data to Neo4j
for node in nodes:
await neo4j_client.create_node(node["type"], node["properties"]) # type: ignore
@app.get("/nodes/{label}")
async def get_nodes(
label: str,
limit: int = Query(default=100, le=settings.max_results),
filters: str | None = Query(default=None),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get nodes by label with optional filters"""
for rel in relationships:
await neo4j_client.create_relationship( # type: ignore
rel["sourceId"],
rel["targetId"],
rel["type"],
rel["properties"],
)
with tracer.start_as_current_span("get_nodes") as span:
span.set_attribute("label", label)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("limit", limit)
try:
# Parse filters
filter_dict: dict[str, Any] = {}
if filters:
try:
filter_dict = json.loads(filters)
except json.JSONDecodeError:
raise HTTPException(status_code=400, detail="Invalid filters JSON")
# Add tenant isolation
filter_dict["tenant_id"] = tenant_id
# Build query
query = TemporalQueries.get_current_state_query(label, filter_dict)
query += f" LIMIT {limit}"
# Execute query
results = await neo4j_client.run_query(query)
# Update metrics
metrics.counter("nodes_queried_total").labels(
tenant_id=tenant_id, label=label
).inc()
return {
"label": label,
"count": len(results),
"nodes": [result["n"] for result in results],
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get nodes", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to get nodes: {str(e)}"
# 3. Publish kg.upserted event
event_payload = EventPayload(
data={
"document_id": document_id,
"tenant_id": tenant_id,
"taxpayer_id": data.get("taxpayer_id"),
"tax_year": data.get("tax_year"),
"node_count": len(nodes),
"relationship_count": len(relationships),
},
actor=payload.actor,
tenant_id=tenant_id,
trace_id=str(span.get_span_context().trace_id),
)
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload) # type: ignore
@app.get("/nodes/{label}/{node_id}")
async def get_node(
label: str,
node_id: str,
include_lineage: bool = Query(default=False),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get specific node with optional lineage"""
with tracer.start_as_current_span("get_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get node
query = f"""
MATCH (n:{label} {{id: $node_id, tenant_id: $tenant_id}})
WHERE n.retracted_at IS NULL
RETURN n
"""
results = await neo4j_client.run_query(
query, {"node_id": node_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Node not found")
node_data = results[0]["n"]
# Get lineage if requested
lineage: list[dict[str, Any]] = []
if include_lineage:
lineage = await neo4j_client.get_node_lineage(node_id)
return {"node": node_data, "lineage": lineage if include_lineage else None}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(status_code=500, detail=f"Failed to get node: {str(e)}")
@app.put("/nodes/{label}/{node_id}")
async def update_node(
label: str,
node_id: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Update node with bitemporal versioning"""
with tracer.start_as_current_span("update_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
properties["tenant_id"] = tenant_id
properties["updated_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Update node (creates new version)
await neo4j_client.update_node(label, node_id, properties)
# Update metrics
metrics.counter("nodes_updated_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node updated", label=label, node_id=node_id)
return {
"status": "updated",
"label": label,
"node_id": node_id,
"properties": properties,
}
except Exception as e:
logger.error(
"Failed to update node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(
status_code=500, detail=f"Failed to update node: {str(e)}"
)
@app.post("/relationships")
async def create_relationship(
from_label: str,
from_id: str,
to_label: str,
to_id: str,
relationship_type: str,
properties: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create relationship between nodes"""
with tracer.start_as_current_span("create_relationship") as span:
span.set_attribute("from_label", from_label)
span.set_attribute("to_label", to_label)
span.set_attribute("relationship_type", relationship_type)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
rel_properties = properties or {}
rel_properties["tenant_id"] = tenant_id
rel_properties["created_by"] = current_user.get("sub", "system")
# Create relationship
await neo4j_client.create_relationship(
from_label, from_id, to_label, to_id, relationship_type, rel_properties
)
# Update metrics
metrics.counter("relationships_created_total").labels(
tenant_id=tenant_id, relationship_type=relationship_type
).inc()
metrics.counter("kg_upserts_total").labels(tenant_id=tenant_id).inc()
logger.info(
"Relationship created",
from_id=from_id,
to_id=to_id,
type=relationship_type,
"KG upsert completed", document_id=document_id, tenant_id=tenant_id
)
return {
"status": "created",
"from_id": from_id,
"to_id": to_id,
"relationship_type": relationship_type,
"properties": rel_properties,
}
except Exception as e:
logger.error("Failed to create relationship", error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create relationship: {str(e)}"
logger.error(
"Failed to upsert KG data", document_id=document_id, error=str(e)
)
@app.post("/query")
async def execute_query(
query: str,
parameters: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Execute custom Cypher query with tenant isolation"""
with tracer.start_as_current_span("execute_query") as span:
span.set_attribute("tenant_id", tenant_id)
try:
# Add tenant isolation to parameters
query_params = parameters or {}
query_params["tenant_id"] = tenant_id
# Validate query (basic security check)
if not _is_safe_query(query):
raise HTTPException(status_code=400, detail="Unsafe query detected")
# Execute query with timeout
results = await neo4j_client.run_query(query, query_params, max_retries=1)
# Update metrics
metrics.counter("custom_queries_total").labels(tenant_id=tenant_id).inc()
return {
"query": query,
"parameters": query_params,
"results": results,
"count": len(results),
}
except Exception as e:
logger.error("Query execution failed", query=query[:100], error=str(e))
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
@app.get("/export/rdf")
async def export_rdf(
format: str = Query(default="turtle"),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Export knowledge graph as RDF"""
with tracer.start_as_current_span("export_rdf") as span:
span.set_attribute("format", format)
span.set_attribute("tenant_id", tenant_id)
try:
# Export tenant-specific data
rdf_data = await neo4j_client.export_to_rdf(format)
# Update metrics
metrics.counter("rdf_exports_total").labels(
tenant_id=tenant_id, format=format
metrics.counter("kg_upsert_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
return {
"format": format,
"rdf_data": rdf_data,
"exported_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("RDF export failed", format=format, error=str(e))
raise HTTPException(
status_code=500, detail=f"RDF export failed: {str(e)}"
) from e
async def _validate_with_shacl(
nodes: list[dict[str, Any]], relationships: list[dict[str, Any]]
) -> tuple[bool, str]:
"""Validate data against SHACL shapes."""
if not shapes_graph:
logger.warning("SHACL shapes not loaded, skipping validation.")
return True, "SHACL shapes not loaded"
data_graph = Graph()
namespace = "http://ai-tax-agent.com/ontology/"
@app.post("/validate")
async def validate_graph(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Validate knowledge graph with SHACL"""
for node in nodes:
node_uri = URIRef(f"{namespace}{node['id']}")
data_graph.add((node_uri, RDF.type, URIRef(f"{namespace}{node['type']}")))
for key, value in node["properties"].items():
if value is not None:
data_graph.add((node_uri, URIRef(f"{namespace}{key}"), Literal(value)))
with tracer.start_as_current_span("validate_graph") as span:
span.set_attribute("tenant_id", tenant_id)
try:
if not shacl_validator:
raise HTTPException(
status_code=501, detail="SHACL validation not configured"
)
# Export current graph state
rdf_export = await neo4j_client.export_to_rdf("turtle")
# Extract RDF data from export result
rdf_data = rdf_export.get("rdf_data", "")
if not rdf_data:
raise HTTPException(
status_code=500, detail="Failed to export RDF data for validation"
)
# Run SHACL validation
validation_result = await shacl_validator.validate_graph(rdf_data)
# Update metrics
metrics.counter("validations_total").labels(
tenant_id=tenant_id, conforms=validation_result["conforms"]
).inc()
return {
"conforms": validation_result["conforms"],
"violations_count": validation_result["violations_count"],
"results_text": validation_result["results_text"],
"validated_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("Graph validation failed", error=str(e))
raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
async def _validate_node(label: str, properties: dict[str, Any]) -> bool:
"""Validate node with SHACL"""
if not shacl_validator:
return True
for rel in relationships:
source_uri = URIRef(f"{namespace}{rel['sourceId']}")
target_uri = URIRef(f"{namespace}{rel['targetId']}")
rel_uri = URIRef(f"{namespace}{rel['type']}")
data_graph.add((source_uri, rel_uri, target_uri))
try:
# Create a minimal RDF representation of the node for validation
rdf_lines = ["@prefix tax: <https://tax-kg.example.com/> ."]
node_uri = "tax:temp_node"
# Add type declaration
rdf_lines.append(f"{node_uri} a tax:{label} .")
# Add properties
for prop, value in properties.items():
if isinstance(value, str):
rdf_lines.append(f'{node_uri} tax:{prop} "{value}" .')
else:
rdf_lines.append(f"{node_uri} tax:{prop} {value} .")
rdf_data = "\n".join(rdf_lines)
# Validate the node RDF data
validation_result = await shacl_validator.validate_graph(rdf_data)
if not validation_result["conforms"]:
logger.warning(
"Node SHACL validation failed",
label=label,
violations=validation_result["violations_count"],
details=validation_result["results_text"],
)
return False
logger.debug("Node SHACL validation passed", label=label)
return True
conforms, results_graph, results_text = validate(
data_graph,
shacl_graph=shapes_graph,
ont_graph=None, # No ontology graph
inference="rdfs",
abort_on_first=False,
allow_infos=False,
meta_shacl=False,
advanced=False,
js=False,
debug=False,
)
return conforms, results_text
except Exception as e:
logger.error("Node SHACL validation error", label=label, error=str(e))
# Return True to not block operations on validation errors
return True
def _is_safe_query(query: str) -> bool:
"""Basic query safety check"""
query_lower = query.lower()
# Block dangerous operations
dangerous_keywords = [
"delete",
"remove",
"drop",
"create index",
"create constraint",
"load csv",
"call",
"foreach",
]
for keyword in dangerous_keywords:
if keyword in query_lower:
return False
return True
logger.error("Error during SHACL validation", error=str(e))
return False, str(e)
@app.exception_handler(HTTPException)
@@ -561,7 +225,7 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
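This hunk replaces the HTTP CRUD endpoints with an event-driven flow: svc-normalize-map publishes `EventTopics.KG_UPSERT_READY`, and `_handle_kg_upsert_ready` validates the payload with SHACL, writes it to Neo4j, then publishes `EventTopics.KG_UPSERTED`. A sketch of the `data` payload this handler expects, with invented values and key names taken from the code in this commit:

```python
# Illustrative KG_UPSERT_READY data payload; the values are made up and the
# structure mirrors _map_to_kg_ontology (svc_normalize_map) together with
# _handle_kg_upsert_ready / _validate_with_shacl (svc_kg) in this commit.
kg_upsert_data = {
    "document_id": "01HZEXAMPLEDOC",
    "tenant_id": "tenant-001",
    "nodes": [
        {
            "id": "document_01HZEXAMPLEDOC",
            "type": "Document",
            "properties": {"node_type": "Document", "doc_id": "01HZEXAMPLEDOC"},
        },
        {
            "id": "taxpayer_T123",
            "type": "TaxpayerProfile",
            "properties": {"node_type": "TaxpayerProfile", "taxpayer_id": "T123"},
        },
        {
            "id": "expenseitem_item_01HZEXAMPLEITEM",
            "type": "ExpenseItem",
            "properties": {"node_type": "ExpenseItem", "gross": 120.0, "currency": "GBP"},
        },
    ],
    "relationships": [
        {
            "id": "rel_document_to_taxpayer_01HZEXAMPLEDOC",
            "type": "BELONGS_TO",
            "sourceId": "document_01HZEXAMPLEDOC",
            "targetId": "taxpayer_T123",
            "properties": {},
        },
        {
            "id": "rel_taxpayer_has_expenseitem_item_01HZEXAMPLEITEM",
            "type": "HAS_EXPENSE",
            "sourceId": "taxpayer_T123",
            "targetId": "expenseitem_item_01HZEXAMPLEITEM",
            "properties": {},
        },
    ],
}
```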


@@ -1,22 +1,2 @@
# Service-specific dependencies
# RDF and semantic web
rdflib>=7.2.1
pyshacl>=0.30.1
# Graph algorithms
networkx>=3.5
# Data export formats
xmltodict>=1.0.2
# Query optimization
pyparsing>=3.2.5
# Graph visualization (optional)
graphviz>=0.21
# Additional Neo4j utilities
neomodel>=5.5.3
# Cypher query building
py2neo>=2021.2.4
setuptools
pyshacl==0.23.0


@@ -1,53 +1,27 @@
# Multi-stage build for svc_normalize_map
FROM python:3.12-slim AS builder
FROM python:3.12-slim-bookworm
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Set environment variables
ENV PYTHONUNBUFFERED 1
ENV APP_HOME /app
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Create and set working directory
WORKDIR $APP_HOME
# Copy requirements and install dependencies
# Install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -1,24 +1,11 @@
"""Data normalization and knowledge graph mapping."""
# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
from datetime import UTC, datetime
from typing import Any, cast
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -34,83 +21,68 @@ from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class NormalizeMapSettings(BaseAppSettings):
"""Settings for normalize-map service"""
"""Settings for NormalizeMap service"""
service_name: str = "svc-normalize-map"
# Normalization configuration
currency_default: str = "GBP"
date_formats: list[str] = [
"%Y-%m-%d",
"%d/%m/%Y",
"%d-%m-%Y",
"%d %B %Y",
"%d %b %Y",
"%B %d, %Y",
]
# Mapping configuration
confidence_threshold: float = 0.7
auto_create_entities: bool = True
# Validation rules
max_amount: float = 1000000.0 # £1M
min_confidence: float = 0.5
# Create app and settings
app, settings = create_app(
service_name="svc-normalize-map",
title="Tax Agent Normalize-Map Service",
description="Data normalization and knowledge graph mapping service",
settings_class=NormalizeMapSettings,
)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
neo4j_client: Neo4jClient | None = None
settings: NormalizeMapSettings
@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: NormalizeMapSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, event_bus
global storage_client, document_storage, event_bus, neo4j_client, settings
logger.info("Starting normalize-map service")
settings = app_settings
logger.info("Starting NormalizeMap service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
# Subscribe to extraction completion events
await event_bus.subscribe( # type: ignore
EventTopics.DOC_EXTRACTED, _handle_extraction_completed
)
await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted)
logger.info("Normalize-map service started successfully")
logger.info("NormalizeMap service started successfully")
app, _settings = create_app(
service_name="svc-normalize-map",
title="Tax Agent Normalize and Map Service",
description="Normalize extracted data and map to Knowledge Graph",
settings_class=NormalizeMapSettings,
)
# Initialize dependencies immediately
@app.on_event("startup")
async def startup_event(): # type: ignore
await init_dependencies(cast(NormalizeMapSettings, _settings))
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
@app.on_event("shutdown")
@@ -118,456 +90,235 @@ async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, neo4j_client
logger.info("Shutting down normalize-map service")
if neo4j_client:
await neo4j_client.close()
logger.info("Shutting down NormalizeMap service")
if event_bus:
await event_bus.stop()
logger.info("Normalize-map service shutdown complete")
if neo4j_client:
await neo4j_client.close()
logger.info("NormalizeMap service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
"""Handle document extracted events"""
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
extracted_fields = data.get("extraction_results", {}).get("extracted_fields", {})
provenance = data.get("extraction_results", {}).get("provenance", [])
if not doc_id or not tenant_id or not extracted_fields:
logger.warning("Invalid document extracted event", data=data)
return
@app.post("/normalize/{doc_id}")
async def normalize_document(
doc_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Normalize and map document data to knowledge graph"""
with tracer.start_as_current_span("normalize_document") as span:
with tracer.start_as_current_span("normalize_and_map") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if extraction results exist
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# 1. Normalize data
normalized_data = await _normalize_data(extracted_fields)
# Generate normalization ID
normalization_id = str(ulid.new())
span.set_attribute("normalization_id", normalization_id)
# Start background normalization
background_tasks.add_task(
_normalize_and_map_async,
doc_id,
tenant_id,
extraction_results,
normalization_id,
current_user.get("sub", "system"),
# 2. Map to KG ontology
kg_upsert_payload = await _map_to_kg_ontology(
doc_id, tenant_id, normalized_data, provenance
)
logger.info(
"Normalization started",
doc_id=doc_id,
normalization_id=normalization_id,
# 3. Publish kg.upsert.ready event
event_payload = EventPayload(
data=kg_upsert_payload,
actor=payload.actor,
tenant_id=tenant_id,
trace_id=str(span.get_span_context().trace_id),
)
await event_bus.publish(EventTopics.KG_UPSERT_READY, event_payload) # type: ignore
return {
"normalization_id": normalization_id,
"doc_id": doc_id,
"status": "processing",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start normalization")
async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
"""Handle extraction completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
confidence = data.get("confidence", 0.0)
if not doc_id or not tenant_id:
logger.warning("Invalid extraction completion event", data=data)
return
# Only auto-process if confidence is above threshold
if confidence >= settings.confidence_threshold:
logger.info(
"Auto-normalizing extracted document",
doc_id=doc_id,
confidence=confidence,
)
extraction_results = data.get("extraction_results")
if not extraction_results:
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if extraction_results:
await _normalize_and_map_async(
doc_id=doc_id,
tenant_id=tenant_id,
extraction_results=extraction_results,
normalization_id=str(ulid.new()),
actor=payload.actor,
)
else:
logger.info(
"Skipping auto-normalization due to low confidence",
doc_id=doc_id,
confidence=confidence,
)
except Exception as e:
logger.error("Failed to handle extraction completion", error=str(e))
async def _normalize_and_map_async(
doc_id: str,
tenant_id: str,
extraction_results: dict[str, Any],
normalization_id: str,
actor: str,
) -> None:
"""Normalize and map data asynchronously"""
with tracer.start_as_current_span("normalize_and_map_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("normalization_id", normalization_id)
try:
extracted_fields = extraction_results.get("extracted_fields", {})
provenance = extraction_results.get("provenance", [])
# Normalize extracted data
normalized_data = await _normalize_data(extracted_fields, provenance)
# Map to knowledge graph entities
entities = await _map_to_entities(normalized_data, doc_id, tenant_id)
# Store entities in knowledge graph
stored_entities = await _store_entities(entities, tenant_id)
# Create normalization results
normalization_results = {
"doc_id": doc_id,
"normalization_id": normalization_id,
"normalized_at": datetime.utcnow().isoformat(),
"normalized_data": normalized_data,
"entities": stored_entities,
"entity_count": len(stored_entities),
}
logger.info("Normalization completed", results=normalization_results)
# Update metrics
metrics.counter("documents_normalized_total").labels(
metrics.counter("normalized_documents_total").labels(
tenant_id=tenant_id
).inc()
metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
len(stored_entities)
)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"normalization_id": normalization_id,
"entity_count": len(stored_entities),
"entities": stored_entities,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)
logger.info(
"Normalization completed", doc_id=doc_id, entities=len(stored_entities)
"Document normalized and mapped", doc_id=doc_id, tenant_id=tenant_id
)
except Exception as e:
logger.error("Normalization failed", doc_id=doc_id, error=str(e))
# Update error metrics
logger.error(
"Failed to normalize and map document", doc_id=doc_id, error=str(e)
)
metrics.counter("normalization_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
async def _normalize_data(
extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
"""Normalize extracted data"""
normalized = {}
for field_name, raw_value in extracted_fields.items():
try:
if "amount" in field_name.lower() or "total" in field_name.lower():
normalized[field_name] = _normalize_amount(raw_value)
elif "date" in field_name.lower():
normalized[field_name] = _normalize_date(raw_value)
elif "name" in field_name.lower():
normalized[field_name] = _normalize_name(raw_value)
elif "address" in field_name.lower():
normalized[field_name] = _normalize_address(raw_value)
elif "number" in field_name.lower():
normalized[field_name] = _normalize_number(raw_value)
else:
normalized[field_name] = _normalize_text(raw_value)
except Exception as e:
logger.warning(
"Failed to normalize field",
field=field_name,
value=raw_value,
error=str(e),
)
normalized[field_name] = raw_value # Keep original value
return normalized
def _normalize_amount(value: str) -> dict[str, Any]:
"""Normalize monetary amount"""
import re
if not value:
return {"amount": None, "currency": settings.currency_default}
# Remove currency symbols and formatting
clean_value = re.sub(r"[£$€,\s]", "", str(value))
try:
amount = Decimal(clean_value)
# Validate amount
if amount > settings.max_amount:
logger.warning("Amount exceeds maximum", amount=amount)
return {
"amount": float(amount),
"currency": settings.currency_default,
"original": value,
}
except Exception:
return {
"amount": None,
"currency": settings.currency_default,
"original": value,
}
def _normalize_date(value: str) -> dict[str, Any]:
"""Normalize date"""
from dateutil import parser
if not value:
return {"date": None, "original": value}
try:
# Try parsing with dateutil first
parsed_date = parser.parse(str(value), dayfirst=True)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
# Try manual formats
for fmt in settings.date_formats:
async def _normalize_data(extracted_fields: dict[str, Any]) -> dict[str, Any]:
"""Normalize extracted data into a consistent format"""
normalized_data = {}
for key, value in extracted_fields.items():
# Example: Simple date normalization (can be expanded)
if "date" in key.lower() and isinstance(value, str):
try:
parsed_date = datetime.strptime(str(value), fmt)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
continue
return {"date": None, "original": value}
# Attempt to parse various date formats
# Add more robust date parsing logic here as needed
normalized_data[key] = datetime.fromisoformat(value).date().isoformat()
except ValueError:
normalized_data[key] = value # Keep original if parsing fails
elif "amount" in key.lower() and isinstance(value, str):
# Example: Normalize currency to a Decimal
try:
normalized_data[key] = float(value.replace("£", "").replace(",", ""))
except ValueError:
normalized_data[key] = value
else:
normalized_data[key] = value
return normalized_data
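# Illustrative behaviour of _normalize_data above (values made up): an input of
#   {"invoice_date": "2024-05-01", "total_amount": "£1,250.00", "supplier": "Acme Ltd"}
# comes back as
#   {"invoice_date": "2024-05-01", "total_amount": 1250.0, "supplier": "Acme Ltd"};
# anything that fails to parse is passed through unchanged.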
def _normalize_name(value: str) -> dict[str, Any]:
"""Normalize person/company name"""
if not value:
return {"name": None, "original": value}
async def _map_to_kg_ontology(
doc_id: str,
tenant_id: str,
normalized_data: dict[str, Any],
provenance: list[dict[str, Any]],
) -> dict[str, Any]:
"""Map normalized data to Knowledge Graph ontology nodes and relationships based on kg_schema.json"""
nodes = []
relationships = []
now = datetime.now(UTC).isoformat()
# Clean and title case
clean_name = str(value).strip().title()
# Create a Document node
doc_node_id = f"document_{doc_id}"
nodes.append(
{
"id": doc_node_id,
"type": "Document",
"properties": {
"node_type": "Document",
"doc_id": doc_id,
"kind": normalized_data.get("kind", "OtherSupportingDoc"),
"source": normalized_data.get("source", "manual_upload"),
"checksum": normalized_data.get("checksum", ""),
"valid_from": now,
"asserted_at": now,
# "source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
# Detect if it's a company (contains Ltd, Limited, etc.)
company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
is_company = any(indicator in clean_name for indicator in company_indicators)
# Create a TaxpayerProfile node
taxpayer_id = normalized_data.get("taxpayer_id", "unknown_taxpayer")
taxpayer_node_id = f"taxpayer_{taxpayer_id}"
nodes.append(
{
"id": taxpayer_node_id,
"type": "TaxpayerProfile",
"properties": {
"node_type": "TaxpayerProfile",
"taxpayer_id": taxpayer_id,
"type": "Individual",
"valid_from": now,
"asserted_at": now,
"source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
relationships.append(
{
"id": f"rel_document_to_taxpayer_{doc_id}",
"type": "BELONGS_TO",
"sourceId": doc_node_id,
"targetId": taxpayer_node_id,
"properties": {},
}
)
# Create IncomeItem/ExpenseItem nodes and Evidence nodes
item_type = (
"IncomeItem" if normalized_data.get("kind") == "invoice" else "ExpenseItem"
)
for field, value in normalized_data.items():
if field in ["total_amount", "net_amount", "vat_amount", "amount"]:
item_id = f"item_{ulid.new()}"
item_node_id = f"{item_type.lower()}_{item_id}"
# Create the financial item node (IncomeItem or ExpenseItem)
nodes.append(
{
"id": item_node_id,
"type": item_type,
"properties": {
"node_type": item_type,
"type": (
"self_employment"
if "invoice" in normalized_data.get("kind", "")
else "other"
),
"gross": value,
"currency": "GBP",
"description": normalized_data.get("description", field),
"valid_from": now,
"asserted_at": now,
"source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
relationships.append(
{
"id": f"rel_taxpayer_has_{item_type.lower()}_{item_id}",
"type": (
"HAS_INCOME" if item_type == "IncomeItem" else "HAS_EXPENSE"
),
"sourceId": taxpayer_node_id,
"targetId": item_node_id,
"properties": {},
}
)
# Create an Evidence node linking the item to the document
prov = next((p for p in provenance if p["field"] == field), None)
if prov:
evidence_id = f"evidence_{item_id}"
nodes.append(
{
"id": evidence_id,
"type": "Evidence",
"properties": {
"node_type": "Evidence",
"snippet_id": evidence_id,
"doc_ref": doc_id,
"page": prov.get("page"),
"bbox": prov.get("bbox"),
"text_hash": "dummy_hash", # Placeholder
"ocr_confidence": prov.get("confidence"),
"extracted_text": str(value),
"valid_from": now,
"asserted_at": now,
"source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
relationships.append(
{
"id": f"rel_item_supported_by_evidence_{item_id}",
"type": "SUPPORTED_BY",
"sourceId": item_node_id,
"targetId": evidence_id,
"properties": {},
}
)
    return {
        "nodes": nodes,
        "relationships": relationships,
        "document_id": doc_id,
        "tenant_id": tenant_id,
    }
def _normalize_address(value: str) -> dict[str, Any]:
"""Normalize address"""
import re
if not value:
return {"address": None, "original": value}
clean_address = str(value).strip()
# Extract UK postcode
postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
postcode = postcode_match.group().upper() if postcode_match else None
return {"address": clean_address, "postcode": postcode, "original": value}
def _normalize_number(value: str) -> dict[str, Any]:
"""Normalize reference numbers"""
import re
if not value:
return {"number": None, "original": value}
# Remove spaces and special characters
clean_number = re.sub(r"[^\w]", "", str(value))
# Detect number type
number_type = "unknown"
if len(clean_number) == 10 and clean_number.isdigit():
number_type = "utr" # UTR is 10 digits
elif len(clean_number) == 8 and clean_number.isdigit():
number_type = "account_number"
elif re.match(r"^\d{6}$", clean_number):
number_type = "sort_code"
return {"number": clean_number, "type": number_type, "original": value}
def _normalize_text(value: str) -> dict[str, Any]:
"""Normalize general text"""
if not value:
return {"text": None, "original": value}
clean_text = str(value).strip()
return {"text": clean_text, "original": value}
async def _map_to_entities(
normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
"""Map normalized data to knowledge graph entities"""
entities = []
# Create document entity
doc_entity = {
"type": "Document",
"id": doc_id,
"properties": {
"doc_id": doc_id,
"tenant_id": tenant_id,
"processed_at": datetime.utcnow().isoformat(),
"source": "extraction",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(doc_entity)
# Map specific field types to entities
for field_name, normalized_value in normalized_data.items():
if isinstance(normalized_value, dict):
if "amount" in normalized_value and normalized_value["amount"] is not None:
# Create expense or income item
entity_type = (
"ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
)
entity = {
"type": entity_type,
"id": f"{entity_type.lower()}_{ulid.new()}",
"properties": {
"amount": normalized_value["amount"],
"currency": normalized_value["currency"],
"description": field_name,
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
elif "name" in normalized_value and normalized_value["name"] is not None:
# Create party entity
entity = {
"type": "Party",
"id": f"party_{ulid.new()}",
"properties": {
"name": normalized_value["name"],
"party_type": normalized_value.get("type", "unknown"),
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
return entities
async def _store_entities(
entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
"""Store entities in knowledge graph"""
stored_entities = []
for entity in entities:
try:
# Create node in Neo4j
result = await neo4j_client.create_node(
label=entity["type"], properties=entity["properties"]
)
stored_entities.append(
{
"type": entity["type"],
"id": entity["id"],
"neo4j_id": result.get("id"),
"properties": entity["properties"],
}
)
logger.debug("Entity stored", type=entity["type"], id=entity["id"])
except Exception as e:
logger.error("Failed to store entity", entity=entity, error=str(e))
return stored_entities
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
@@ -579,8 +330,8 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONRe
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).dict(),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)

View File

@@ -1,37 +1 @@
# FastAPI and server
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0
# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.3.3
numpy>=2.3.3
# Currency and exchange rates
forex-python>=1.9.2
babel>=2.17.0
# Date and time processing
python-dateutil>=2.9.0
pytz>=2025.2
# Text normalization
unidecode>=1.4.0
phonenumbers>=9.0.16
# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.27.1
# Geographic data
geopy>=2.4.1
pycountry>=24.6.1
# Data validation
cerberus>=1.3.7
marshmallow>=4.0.1
# UK-specific utilities
uk-postcode-utils>=1.1
python-ulid

View File

@@ -7,13 +7,14 @@ import os
# Import shared libraries
import sys
from contextlib import asynccontextmanager
from datetime import datetime
from typing import Any, cast
import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
@@ -78,6 +79,8 @@ settings: OCRSettings
async def init_dependencies(app_settings: OCRSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus, settings, vision_processor
# Larger delay to ensure NATS is fully ready before attempting connection
await asyncio.sleep(10)
settings = app_settings
logger.info("Starting OCR service")
@@ -89,17 +92,35 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
eb = event_bus
# mypy: event_bus is Optional, so use local alias after check
await eb.start()
# Subscribe to document ingestion events
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
# Initialize event bus with retry logic
max_retries = 20
delay = 5
for attempt in range(1, max_retries + 1):
logger.info(
"Attempting NATS connection", url=settings.nats_servers, attempt=attempt
)
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
eb = event_bus
try:
# Attempt to start and subscribe
await eb.start()
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
logger.info("NATS connection established on attempt", attempt=attempt)
break
except Exception as e:
logger.error(
"Failed to connect to NATS, retrying",
attempt=attempt,
error=str(e),
)
if attempt == max_retries:
raise HTTPException(
status_code=500, detail="Failed to connect to NATS after retries"
)
await asyncio.sleep(delay)
delay *= 2 # exponential backoff
# Initialize shared OCRProcessor for vision strategy
try:
@@ -114,7 +135,26 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
logger.info("OCR service started successfully")
# Create app and settings
async def shutdown_dependencies() -> None:
"""Shutdown service dependencies"""
logger.info("Shutting down OCR service")
eb = event_bus
if eb is not None:
await eb.stop()
logger.info("OCR service shutdown complete")
@asynccontextmanager
async def lifespan(app: FastAPI): # type: ignore
"""FastAPI lifespan event handler"""
# Startup
await init_dependencies(cast(OCRSettings, _settings))
yield
# Shutdown
await shutdown_dependencies()
# Create app and settings with lifespan
app, _settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
@@ -122,8 +162,8 @@ app, _settings = create_app(
settings_class=OCRSettings,
) # fmt: skip
# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
# Override app's lifespan
app.router.lifespan_context = lifespan
tracer = get_tracer("svc-ocr")
metrics = get_metrics()

View File

@@ -14,3 +14,12 @@ opencv-python-headless>=4.12.0.88 # Headless version is smaller
# Computer vision (torchvision not in base-ml)
torchvision>=0.23.0
# OpenTelemetry (required by libs/observability)
opentelemetry-api>=1.21.0
opentelemetry-sdk>=1.21.0
opentelemetry-exporter-otlp-proto-grpc>=1.21.0
opentelemetry-instrumentation-fastapi>=0.42b0
opentelemetry-instrumentation-httpx>=0.42b0
opentelemetry-instrumentation-psycopg2>=0.42b0
opentelemetry-instrumentation-redis>=0.42b0

View File

@@ -10,12 +10,15 @@ FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
RUN apt-get update && apt-get install -y build-essential
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rag_indexer/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
@@ -26,7 +29,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port

View File

@@ -10,12 +10,15 @@ FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
RUN apt-get update && apt-get install -y build-essential
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rag_retriever/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/

View File

@@ -43,7 +43,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port

View File

@@ -17,6 +17,7 @@ from datetime import datetime
from decimal import Decimal
from typing import Any
import httpx
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
@@ -55,6 +56,9 @@ class ReasonSettings(BaseAppSettings):
max_income: float = 10000000.0 # £10M
max_expenses: float = 10000000.0 # £10M
# External services
coverage_service_url: str = "http://svc-coverage:8000"
# Create app and settings
app, settings = create_app(
@@ -67,6 +71,7 @@ app, settings = create_app(
# Global clients
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
http_client: httpx.AsyncClient | None = None
tracer = get_tracer("svc-reason")
metrics = get_metrics()
@@ -74,7 +79,7 @@ metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, event_bus
global neo4j_client, event_bus, http_client
logger.info("Starting reasoning service")
@@ -89,6 +94,9 @@ async def startup_event() -> None:
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip# pyright: ignore[reportOptionalMemberAccess]
# Initialize HTTP client
http_client = httpx.AsyncClient()
# Subscribe to KG upsert events
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
@@ -98,7 +106,7 @@ async def startup_event() -> None:
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
global neo4j_client, event_bus, http_client
logger.info("Shutting down reasoning service")
@@ -108,6 +116,9 @@ async def shutdown_event() -> None:
if event_bus:
await event_bus.stop()
if http_client:
await http_client.aclose()
logger.info("Reasoning service shutdown complete")
@@ -259,41 +270,76 @@ async def get_calculation_results(
async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
"""Handle KG upsert events for auto-calculation"""
"""Handle KG upsert events for auto-calculation and coverage check"""
data = payload.data
taxpayer_id = data.get("taxpayer_id")
tax_year = data.get("tax_year")
tenant_id = data.get("tenant_id")
if not taxpayer_id or not tax_year or not tenant_id:
logger.warning("Invalid KG upsert event data for coverage check", data=data)
return
# Trigger svc-coverage check
try:
data = payload.data
entities = data.get("entities", [])
tenant_id = data.get("tenant_id")
# Check if we have enough data for calculation
has_income = any(e.get("type") == "IncomeItem" for e in entities)
has_expenses = any(e.get("type") == "ExpenseItem" for e in entities)
if has_income or has_expenses:
if http_client:
coverage_url = f"{settings.coverage_service_url}/v1/coverage/check"
request_body = {
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
}
headers = {
"X-Tenant-ID": tenant_id,
# Assuming current_user is not directly available here,
# or a system user token needs to be generated.
# For now, omitting X-Authenticated-User for simplicity,
# but in a real system, this should be handled securely.
}
response = await http_client.post(coverage_url, json=request_body, headers=headers)
response.raise_for_status()
coverage_report = response.json()
logger.info(
"Auto-triggering calculation due to new financial data",
tenant_id=tenant_id,
"Triggered svc-coverage check",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
coverage_status=coverage_report.get("overall_status"),
)
# Find taxpayer ID from entities
taxpayer_id = None
for entity in entities:
if entity.get("type") == "TaxpayerProfile":
taxpayer_id = entity.get("id")
break
if taxpayer_id:
# If coverage is complete, trigger calculation
if coverage_report.get("overall_status") == "complete":
logger.info(
"Coverage complete, auto-triggering calculation",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
)
await _compute_schedule_async(
tax_year=settings.current_tax_year,
tax_year=tax_year,
taxpayer_id=taxpayer_id,
schedule_id="SA103", # Default to self-employment
tenant_id=tenant_id or "",
tenant_id=tenant_id,
calculation_id=str(ulid.new()),
actor=payload.actor,
)
else:
logger.info(
"Coverage incomplete, not triggering calculation",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
blocking_items=coverage_report.get("blocking_items"),
)
except httpx.HTTPStatusError as e:
logger.error(
"Failed to trigger svc-coverage check due to HTTP error",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
error=str(e),
response_status_code=e.response.status_code,
response_text=e.response.text,
)
except Exception as e:
logger.error("Failed to handle KG upsert for auto-calculation", error=str(e))
logger.error("Failed to handle KG upsert for auto-calculation or coverage check", error=str(e))
async def _compute_schedule_async(
@@ -570,16 +616,107 @@ async def _compute_sa105(
async def _compute_sa100(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA100 (Main return) schedule"""
# This would aggregate from other schedules
# For now, return basic structure
form_boxes = {
"1": {"value": "John Doe", "description": "Your name", "confidence": 0.9}
}
"""Compute SA100 (Main return) schedule by aggregating other schedules"""
form_boxes = {}
evidence_trail: list[dict[str, Any]] = []
taxpayer_id = financial_data.get("taxpayer_id")
tenant_id = financial_data.get("tenant_id") # Assuming tenant_id is passed in financial_data
if not taxpayer_id or not tenant_id:
raise ValueError("Taxpayer ID or Tenant ID missing for SA100 computation")
# Get latest SA103 calculation
sa103_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_CALCULATION]->(c:Calculation)
WHERE c.schedule = 'SA103' AND c.tax_year = $tax_year AND c.retracted_at IS NULL
OPTIONAL MATCH (c)-[:HAS_BOX]->(b:FormBox)
RETURN c.calculation_id AS calculation_id, c.calculated_at AS calculated_at, COLLECT({box: b.box, value: b.value, description: b.description, confidence: b.confidence}) AS form_boxes
ORDER BY c.calculated_at DESC
LIMIT 1
"""
sa103_results = await neo4j_client.run_query( # type: ignore
sa103_query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id, "tax_year": tax_year}
)
sa103_calc = sa103_results[0] if sa103_results else None
sa103_net_profit = Decimal("0")
if sa103_calc and sa103_calc["form_boxes"]:
for box in sa103_calc["form_boxes"]:
if box["box"] == "32": # Net profit box in SA103
sa103_net_profit = Decimal(str(box["value"]))
form_boxes["SA103_32"] = {"value": float(sa103_net_profit), "description": "SA103 Net Profit", "confidence": box.get("confidence", 0.9)}
evidence_trail.append({
"box": "SA103_32",
"source_calculation_id": sa103_calc["calculation_id"],
"description": "Derived from SA103 Net Profit"
})
break
# Get latest SA105 calculation
sa105_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_CALCULATION]->(c:Calculation)
WHERE c.schedule = 'SA105' AND c.tax_year = $tax_year AND c.retracted_at IS NULL
OPTIONAL MATCH (c)-[:HAS_BOX]->(b:FormBox)
RETURN c.calculation_id AS calculation_id, c.calculated_at AS calculated_at, COLLECT({box: b.box, value: b.value, description: b.description, confidence: b.confidence}) AS form_boxes
ORDER BY c.calculated_at DESC
LIMIT 1
"""
sa105_results = await neo4j_client.run_query( # type: ignore
sa105_query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id, "tax_year": tax_year}
)
sa105_calc = sa105_results[0] if sa105_results else None
sa105_net_income = Decimal("0")
if sa105_calc and sa105_calc["form_boxes"]:
for box in sa105_calc["form_boxes"]:
if box["box"] == "net_income": # Net property income box in SA105 (custom box for internal calculation)
sa105_net_income = Decimal(str(box["value"]))
form_boxes["SA105_net_income"] = {"value": float(sa105_net_income), "description": "SA105 Net Property Income", "confidence": box.get("confidence", 0.9)}
evidence_trail.append({
"box": "SA105_net_income",
"source_calculation_id": sa105_calc["calculation_id"],
"description": "Derived from SA105 Net Property Income"
})
break
# Aggregate total income for SA100
total_income = sa103_net_profit + sa105_net_income
form_boxes["SA100_total_income"] = {
"value": float(total_income),
"description": "Total income from all sources",
"confidence": 0.95 # Higher confidence for aggregated value
}
evidence_trail.append({
"box": "SA100_total_income",
"derived_from": ["SA103_32", "SA105_net_income"],
"description": "Aggregated from SA103 net profit and SA105 net property income"
})
# Example: Basic personal allowance (simplified)
personal_allowance = Decimal("12570") # For 2023-24
if total_income > Decimal("100000"): # Tapering not implemented here
personal_allowance = Decimal("0")
form_boxes["SA100_personal_allowance"] = {
"value": float(personal_allowance),
"description": "Personal Allowance",
"confidence": 0.99
}
evidence_trail.append({
"box": "SA100_personal_allowance",
"source": "HMRC_guidance",
"description": f"Standard personal allowance for {tax_year}"
})
# Placeholder for actual SA100 boxes and complex calculations
# This would involve detailed tax band calculations, reliefs, etc.
# For now, we'll just show the aggregation.
form_boxes["1"] = {"value": "John Doe (Aggregated)", "description": "Your name", "confidence": 0.9}
return form_boxes, evidence_trail
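
The SA100 block above pins the personal allowance at £12,570 and simply zeroes it above £100,000, with a comment noting that tapering is not implemented. For reference, a minimal sketch of the standard taper (2023-24 figures; the helper name and rounding behaviour are illustrative, not part of the service code):

```python
from decimal import Decimal


def tapered_personal_allowance(
    adjusted_net_income: Decimal,
    base_allowance: Decimal = Decimal("12570"),
    taper_threshold: Decimal = Decimal("100000"),
) -> Decimal:
    """£1 of allowance is withdrawn for every £2 of income above the threshold."""
    if adjusted_net_income <= taper_threshold:
        return base_allowance
    reduction = (adjusted_net_income - taper_threshold) / 2
    return max(Decimal("0"), base_allowance - reduction)


# e.g. tapered_personal_allowance(Decimal("110000")) == Decimal("7570"),
# and the allowance reaches zero at £125,140.
```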

View File

@@ -33,3 +33,4 @@ jinja2>=3.1.6
# Statistical calculations
scipy>=1.16.2
httpx

View File

@@ -42,8 +42,8 @@ Deliver a complete, implementable solution—ontology, extraction pipeline, RAG+
2. **svc-rpa** — Playwright RPA for firm/client portals; Prefect-scheduled; emits `doc.ingested`.
3. **svc-ocr** — Tesseract (local) or Textract (scale); de-skew/rotation/layout; emits `doc.ocr_ready`.
4. **svc-extract** — LLM + rules + table detectors → **schema-constrained JSON** (kv + tables + bbox/page); emits `doc.extracted`.
5. **svc-normalize-map**normalize currency/dates; entity resolution; assign tax year; map to KG nodes/edges with **Evidence** anchors; emits `kg.upserted`.
6. **svc-kg**Neo4j DDL + **SHACL** validation; **bitemporal** writes `{valid_from, valid_to, asserted_at}`; RDF export.
5. **svc-normalize-map**Consumes `doc.extracted` events; normalizes extracted data (currencies, dates); performs entity resolution; assigns tax year; maps to KG nodes/edges with **Evidence** anchors; emits `kg.upsert.ready` events.
6. **svc-kg**Consumes `kg.upsert.ready` events; performs Neo4j DDL operations + **SHACL** validation; **bitemporal** writes `{valid_from, valid_to, asserted_at}`; RDF export; emits `kg.upserted` events.
7. **svc-rag-indexer** — chunk/de-identify/embed; upsert **Qdrant** collections (firm knowledge, legislation, best practices, glossary).
8. **svc-rag-retriever****hybrid retrieval** (dense + sparse) + rerank + **KG-fusion**; returns chunks + citations + KG join hints.
9. **svc-reason** — deterministic calculators (employment, self-employment, property, dividends/interest, allowances, NIC, HICBC, student loans); Cypher materializers; explanations.
@@ -51,11 +51,12 @@ Deliver a complete, implementable solution—ontology, extraction pipeline, RAG+
11. **svc-hmrc** — submit stub|sandbox|live; rate-limit & retries; submission audit.
12. **svc-firm-connectors** — read-only connectors to Firm Databases; sync to **Secure Client Data Store** with lineage.
13. **ui-review** — Next.js reviewer portal (SSO via Traefik+Authentik); reviewers accept/override extractions.
14. **svc-coverage** — Evaluates document coverage against policies, identifies gaps, and generates clarifying questions.
## Orchestration & Messaging
- **Prefect 2.x** for local orchestration; **Temporal** for production scale (sagas, retries, idempotency).
- Events: Kafka (or SQS/SNS) — `doc.ingested`, `doc.ocr_ready`, `doc.extracted`, `kg.upserted`, `rag.indexed`, `calc.schedule_ready`, `form.filled`, `hmrc.submitted`, `review.requested`, `review.completed`, `firm.sync.completed`.
- Events: Kafka (or SQS/SNS) — `doc.ingested`, `doc.ocr_ready`, `doc.extracted`, `kg.upsert.ready`, `kg.upserted`, `rag.indexed`, `calc.schedule_ready`, `form.filled`, `hmrc.submitted`, `review.requested`, `review.completed`, `firm.sync.completed`.
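
As a sketch of how a service attaches to these topics via the shared event-bus helper (the handler body is illustrative; `EventTopics` and the payload type are assumed to live in `libs.events`, as used elsewhere in this repo):

```python
from libs.events import EventTopics, create_event_bus


async def handle_doc_ingested(topic: str, payload) -> None:
    # Illustrative: react to a newly ingested document (e.g. enqueue OCR).
    ...


async def start_consumer(settings) -> None:
    event_bus = create_event_bus(settings)  # settings: the service's app settings
    await event_bus.start()
    await event_bus.subscribe(EventTopics.DOC_INGESTED, handle_doc_ingested)
```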
## Concrete Stack (pin/assume unless replaced)
@@ -103,7 +104,7 @@ repo/
svc-ingestion/ svc-rpa/ svc-ocr/ svc-extract/
svc-normalize-map/ svc-kg/ svc-rag-indexer/ svc-rag-retriever/
svc-reason/ svc-forms/ svc-hmrc/ svc-firm-connectors/
ui-review/
svc-coverage/ ui-review/
kg/
ONTOLOGY.md
schemas/{nodes_and_edges.schema.json, context.jsonld, shapes.ttl}

View File

@@ -7,6 +7,7 @@ This guide explains how to run services locally for development.
### Prerequisites
1. **Infrastructure Services Running**: Ensure Docker Compose infrastructure is running:
```bash
make deploy-infra
```
@@ -39,17 +40,17 @@ DISABLE_AUTH=true cd apps/svc_ingestion && uvicorn main:app --reload --host 0.0.
### Environment Variables for Development
| Variable | Description | Default | Dev Value |
|----------|-------------|---------|-----------|
| `DISABLE_AUTH` | Disable authentication middleware | `false` | `true` |
| `DEV_MODE` | Enable development mode | `false` | `true` |
| `VAULT_ADDR` | Vault server address | `http://vault:8200` | - |
| `VAULT_TOKEN` | Vault token (dev only) | - | `root` |
| `MINIO_ENDPOINT` | MinIO endpoint | `minio:9000` | `minio:9092` |
| `POSTGRES_URL` | PostgreSQL connection URL | - | `postgresql://postgres:postgres@localhost:5432/tax_system` |
| `REDIS_URL` | Redis connection URL | `redis://redis:6379` | `redis://localhost:6379` |
| `NEO4J_URI` | Neo4j connection URI | `bolt://neo4j:7687` | `bolt://localhost:7687` |
| `NATS_SERVERS` | NATS server URLs | `nats://nats:4222` | `nats://localhost:4222` |
| Variable | Description | Default | Dev Value |
| ---------------- | --------------------------------- | -------------------- | ---------------------------------------------------------- |
| `DISABLE_AUTH` | Disable authentication middleware | `false` | `true` |
| `DEV_MODE` | Enable development mode | `false` | `true` |
| `VAULT_ADDR` | Vault server address | `http://vault:8200` | - |
| `VAULT_TOKEN` | Vault token (dev only) | - | `root` |
| `MINIO_ENDPOINT` | MinIO endpoint | `minio:9000` | `minio:9092` |
| `POSTGRES_URL` | PostgreSQL connection URL | - | `postgresql://postgres:postgres@localhost:5432/tax_system` |
| `REDIS_URL` | Redis connection URL | `redis://redis:6379` | `redis://localhost:6379` |
| `NEO4J_URI` | Neo4j connection URI | `bolt://neo4j:7687` | `bolt://localhost:7687` |
| `NATS_SERVERS` | NATS server URLs | `nats://nats:4222` | `nats://localhost:4222` |
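
Putting several of these together for a single service (values taken from the table above; adjust per service):

```bash
# Run svc-ingestion locally against the compose infrastructure
cd apps/svc_ingestion
DISABLE_AUTH=true \
DEV_MODE=true \
POSTGRES_URL=postgresql://postgres:postgres@localhost:5432/tax_system \
REDIS_URL=redis://localhost:6379 \
NATS_SERVERS=nats://localhost:4222 \
uvicorn main:app --reload --host 0.0.0.0 --port 8000
```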
### Testing with Postman
@@ -68,6 +69,7 @@ Authorization: Bearer dev-token-12345
#### With Development Mode (DISABLE_AUTH=true)
No authentication headers required! The middleware automatically sets:
- User: `dev-user`
- Email: `dev@example.com`
- Roles: `["developers"]`
@@ -123,17 +125,20 @@ Create a Postman environment called "AI Tax Agent - Dev":
### Example Requests
#### Health Check
```bash
curl http://localhost:8000/healthz
```
#### Upload Document (Development Mode)
```bash
curl -X POST http://localhost:8000/upload \
-F "file=@/path/to/document.pdf"
```
#### Upload Document (Production Mode)
```bash
curl -X POST http://localhost:8000/upload \
-H "X-Authenticated-User: dev-user" \
@@ -145,41 +150,47 @@ curl -X POST http://localhost:8000/upload \
### Debugging
#### Check Service Logs
```bash
# Local development
# Logs appear in terminal where service is running
# Docker Compose
docker-compose -f infra/compose/docker-compose.local.yml logs -f svc-ingestion
docker compose logs -f svc-ingestion
```
#### Verify Infrastructure Services
```bash
# Check all services status
docker-compose -f infra/compose/docker-compose.local.yml ps
docker compose ps
# Check specific service health
docker-compose -f infra/compose/docker-compose.local.yml exec postgres pg_isready
docker-compose -f infra/compose/docker-compose.local.yml exec redis redis-cli ping
docker-compose -f infra/compose/docker-compose.local.yml exec minio mc --version
docker compose exec postgres pg_isready
docker compose exec redis redis-cli ping
docker compose exec minio mc --version
```
#### Common Issues
**Issue**: `401 Unauthorized` errors
- **Solution**: Set `DISABLE_AUTH=true` when running locally, or add authentication headers
**Issue**: `Connection refused` to database/redis/etc
- **Solution**: Ensure infrastructure services are running with `make deploy-infra`
- **Solution**: Use `localhost` instead of service names when running locally
**Issue**: `Module not found` errors
- **Solution**: Ensure you're running from project root and virtual environment is activated
- **Solution**: Install dependencies: `pip install -r apps/SERVICE_NAME/requirements.txt -r libs/requirements.txt`
### Hot Reload
When running with `uvicorn --reload`, the service automatically reloads when you save changes to:
- Python files in `apps/SERVICE_NAME/`
- Python files in `libs/`
@@ -210,7 +221,7 @@ DISABLE_AUTH=true cd apps/svc_extract && uvicorn main:app --reload --host 0.0.0.
All Docker Compose services are configured with health checks and should show as `healthy`:
```bash
$ docker-compose -f infra/compose/docker-compose.local.yml ps
$ docker compose ps
NAME STATUS
authentik-db Up 35 hours (healthy)
authentik-outpost Up 35 hours (healthy)
@@ -237,4 +248,3 @@ vault Up 35 hours
- See [README.md](README.md) for architecture overview
- See [TESTING.md](TESTING.md) for testing guidelines (if available)
- See service-specific README files in `apps/SERVICE_NAME/` directories

View File

@@ -6,22 +6,23 @@ This document compares the local development environment with the production env
## Quick Reference
| Aspect | Local Development | Production |
|--------|------------------|------------|
| **Domain** | `*.local.lan` | `*.harkon.co.uk` |
| **SSL** | Self-signed certificates | Let's Encrypt (GoDaddy DNS) |
| **Networks** | `ai-tax-agent-frontend`<br/>`ai-tax-agent-backend` | `frontend`<br/>`backend` |
| **Compose File** | `docker-compose.local.yml` | `infrastructure.yaml`<br/>`services.yaml`<br/>`monitoring.yaml` |
| **Location** | Local machine | `deploy@141.136.35.199:/opt/ai-tax-agent/` |
| **Traefik** | Isolated instance | Shared with company services |
| **Authentik** | Isolated instance | Shared with company services |
| **Data Persistence** | Local Docker volumes | Remote Docker volumes + backups |
| Aspect | Local Development | Production |
| -------------------- | -------------------------------------------------- | --------------------------------------------------------------- |
| **Domain** | `*.local.lan` | `*.harkon.co.uk` |
| **SSL** | Self-signed certificates | Let's Encrypt (GoDaddy DNS) |
| **Networks** | `ai-tax-agent-frontend`<br/>`ai-tax-agent-backend` | `frontend`<br/>`backend` |
| **Compose File** | `compose.yaml` | `infrastructure.yaml`<br/>`services.yaml`<br/>`monitoring.yaml` |
| **Location** | Local machine | `deploy@141.136.35.199:/opt/ai-tax-agent/` |
| **Traefik** | Isolated instance | Shared with company services |
| **Authentik** | Isolated instance | Shared with company services |
| **Data Persistence** | Local Docker volumes | Remote Docker volumes + backups |
## Detailed Comparison
### 1. Domain & URLs
#### Local Development
```
Frontend:
- Review UI: https://review.local.lan
@@ -42,6 +43,7 @@ Admin Interfaces:
```
#### Production
```
Frontend:
- Review UI: https://app.harkon.co.uk
@@ -69,6 +71,7 @@ Company Services (shared):
### 2. SSL/TLS Configuration
#### Local Development
- **Certificate Type**: Self-signed
- **Generation**: `scripts/generate-dev-certs.sh`
- **Location**: `infra/compose/certs/local.crt`, `infra/compose/certs/local.key`
@@ -76,6 +79,7 @@ Company Services (shared):
- **Renewal**: Manual (when expired)
#### Production
- **Certificate Type**: Let's Encrypt
- **Challenge**: DNS-01 (GoDaddy)
- **Location**: `/opt/compose/traefik/certs/godaddy-acme.json`
@@ -85,6 +89,7 @@ Company Services (shared):
### 3. Network Configuration
#### Local Development
```yaml
networks:
frontend:
@@ -96,12 +101,14 @@ networks:
```
**Creation**:
```bash
docker network create ai-tax-agent-frontend
docker network create ai-tax-agent-backend
```
#### Production
```yaml
networks:
frontend:
@@ -117,12 +124,14 @@ networks:
### 4. Service Isolation
#### Local Development
- **Traefik**: Dedicated instance for AI Tax Agent
- **Authentik**: Dedicated instance for AI Tax Agent
- **Isolation**: Complete - no shared services
- **Impact**: Changes don't affect other services
#### Production
- **Traefik**: Shared with company services
- **Authentik**: Shared with company services
- **Isolation**: Partial - infrastructure shared, application isolated
@@ -131,12 +140,14 @@ networks:
### 5. Authentication & Authorization
#### Local Development
- **Bootstrap Admin**: `admin@local.lan` / `admin123`
- **Groups**: Auto-created via bootstrap
- **OAuth Clients**: Auto-configured
- **Users**: Test users only
#### Production
- **Bootstrap Admin**: Real admin credentials
- **Groups**:
- `company` - Company services access
@@ -149,6 +160,7 @@ networks:
### 6. Data Persistence
#### Local Development
```bash
# Volume location
/var/lib/docker/volumes/
@@ -168,6 +180,7 @@ networks:
**Retention**: Until `make clean`
#### Production
```bash
# Volume location
/var/lib/docker/volumes/
@@ -188,6 +201,7 @@ networks:
### 7. Environment Variables
#### Local Development (`.env`)
```bash
DOMAIN=local.lan
EMAIL=admin@local.lan
@@ -200,6 +214,7 @@ DEVELOPMENT_MODE=true
```
#### Production (`.env.production`)
```bash
DOMAIN=harkon.co.uk
EMAIL=admin@harkon.co.uk
@@ -214,11 +229,13 @@ DEVELOPMENT_MODE=false
### 8. Resource Limits
#### Local Development
- **No limits**: Uses available resources
- **Suitable for**: Development and testing
- **Scaling**: Not configured
#### Production
```yaml
# Example resource limits
services:
@@ -226,22 +243,24 @@ services:
deploy:
resources:
limits:
cpus: '1.0'
cpus: "1.0"
memory: 1G
reservations:
cpus: '0.5'
cpus: "0.5"
memory: 512M
```
### 9. Logging & Monitoring
#### Local Development
- **Logs**: Docker logs (`docker compose logs`)
- **Retention**: Until container restart
- **Monitoring**: Optional (Grafana available but not required)
- **Alerts**: Disabled
#### Production
- **Logs**: Centralized in Loki
- **Retention**: 30 days
- **Monitoring**: Required (Prometheus + Grafana)
@@ -250,6 +269,7 @@ services:
### 10. Deployment Process
#### Local Development
```bash
# Start everything
make bootstrap
@@ -259,7 +279,7 @@ make up
./scripts/create-networks.sh
./scripts/generate-dev-certs.sh
cd infra/compose
docker compose -f docker-compose.local.yml up -d
docker compose up -d
# Stop everything
make down
@@ -269,6 +289,7 @@ make clean
```
#### Production
```bash
# Deploy infrastructure
cd /opt/ai-tax-agent
@@ -287,11 +308,13 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
### 11. Database Migrations
#### Local Development
- **Automatic**: Migrations run on startup
- **Rollback**: `make clean` and restart
- **Data Loss**: Acceptable
#### Production
- **Manual**: Migrations run explicitly
- **Rollback**: Requires backup restoration
- **Data Loss**: NOT acceptable
@@ -299,11 +322,13 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
### 12. Secrets Management
#### Local Development
- **Storage**: `.env` file (committed to git as example)
- **Vault**: Dev mode (unsealed automatically)
- **Security**: Low (development only)
#### Production
- **Storage**: `.env.production` (NOT committed to git)
- **Vault**: Production mode (manual unseal required)
- **Security**: High (encrypted, access controlled)
@@ -311,11 +336,13 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
### 13. CI/CD Integration
#### Local Development
- **CI/CD**: Not applicable
- **Testing**: Manual
- **Deployment**: Manual
#### Production
- **CI/CD**: Gitea Actions (planned)
- **Testing**: Automated (unit, integration, e2e)
- **Deployment**: Automated with approval gates
@@ -323,12 +350,14 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
### 14. Backup & Recovery
#### Local Development
- **Backup**: Not configured
- **Recovery**: Rebuild from scratch
- **RTO**: N/A
- **RPO**: N/A
#### Production
- **Backup**: Daily automated backups
- **Recovery**: Restore from backup
- **RTO**: 1 hour
@@ -337,11 +366,13 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
### 15. Cost Considerations
#### Local Development
- **Infrastructure**: Free (local machine)
- **Compute**: Uses local resources
- **Storage**: Uses local disk
#### Production
- **Infrastructure**: Server rental (~$50/month)
- **Compute**: Shared with company services
- **Storage**: Included in server
@@ -353,16 +384,19 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
### From Local to Production
1. **Build images locally**:
```bash
docker compose -f docker-compose.local.yml build
docker compose build
```
2. **Tag for production**:
```bash
docker tag svc-ingestion:latest gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:v1.0.0
```
3. **Push to registry**:
```bash
docker push gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:v1.0.0
```
@@ -378,23 +412,26 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
### From Production to Local (for debugging)
1. **Pull production image**:
```bash
docker pull gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:v1.0.0
```
2. **Tag for local use**:
```bash
docker tag gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:v1.0.0 svc-ingestion:latest
```
3. **Run locally**:
```bash
docker compose -f docker-compose.local.yml up -d svc-ingestion
docker compose up -d svc-ingestion
```
## Best Practices
### Local Development
1. ✅ Use `make` commands for consistency
2. ✅ Keep `.env` file updated from `env.example`
3. ✅ Run tests before committing
@@ -402,6 +439,7 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
5. ✅ Clean up regularly with `make clean`
### Production
1. ✅ Never commit `.env.production` to git
2. ✅ Always backup before making changes
3. ✅ Test in local environment first
@@ -413,12 +451,14 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
## Troubleshooting
### Local Development Issues
- **Port conflicts**: Check if ports 80, 443, 8080 are in use
- **Network errors**: Recreate networks with `make networks`
- **Certificate errors**: Regenerate with `./scripts/generate-dev-certs.sh`
- **Service won't start**: Check logs with `docker compose logs <service>`
### Production Issues
- **Service unreachable**: Check Traefik routing and DNS
- **Authentication fails**: Verify Authentik configuration
- **SSL errors**: Check certificate renewal in Traefik

View File

@@ -8,9 +8,10 @@ Successfully integrated NATS.io message broker with JetStream support into the A
### 1. Added NATS Service to Docker Compose
**File**: `infra/compose/docker-compose.local.yml`
**File**: `infra/compose/compose.yaml`
#### NATS Service Configuration:
```yaml
nats:
image: nats:2.10-alpine
@@ -19,9 +20,9 @@ nats:
networks:
- backend
ports:
- "4222:4222" # NATS client connections
- "8222:8222" # HTTP monitoring
- "6222:6222" # Cluster routing (for future clustering)
- "4222:4222" # NATS client connections
- "8222:8222" # HTTP monitoring
- "6222:6222" # Cluster routing (for future clustering)
volumes:
- nats_data:/data
command: >
@@ -33,7 +34,15 @@ nats:
environment:
NATS_LOG_LEVEL: ${NATS_LOG_LEVEL:-info}
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8222/healthz"]
test:
[
"CMD",
"wget",
"--no-verbose",
"--tries=1",
"--spider",
"http://localhost:8222/healthz",
]
interval: 30s
timeout: 10s
retries: 3
@@ -47,6 +56,7 @@ nats:
```
#### Key Features:
- **JetStream Enabled**: Persistent messaging with file-based storage
- **Monitoring**: HTTP monitoring interface on port 8222
- **Cluster Ready**: Port 6222 configured for future clustering
@@ -63,6 +73,7 @@ Added `nats_data:` volume to the volumes section for persistent storage.
Updated **13 application services** to include NATS configuration:
#### Services Updated:
1. `svc-ingestion`
2. `svc-extract`
3. `svc-kg`
@@ -78,6 +89,7 @@ Updated **13 application services** to include NATS configuration:
13. `svc-rpa`
#### Environment Variables Added to Each Service:
```yaml
environment:
# ... existing variables ...
@@ -95,6 +107,7 @@ depends_on:
**File**: `infra/compose/env.example`
Added NATS configuration variables:
```bash
# Event Bus Configuration
EVENT_BUS_TYPE=memory
@@ -119,18 +132,20 @@ cd infra/compose
cp env.example .env
# Start all services including NATS
docker-compose -f docker-compose.local.yml up -d
docker compose up -d
# Check NATS status
docker-compose -f docker-compose.local.yml logs nats
docker compose logs nats
```
### Using NATS in Applications
#### Option 1: Environment Variable Configuration
Set `EVENT_BUS_TYPE=nats` in your environment to use NATS instead of memory/kafka.
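
For example, in `.env` (values mirror the defaults documented below):

```bash
EVENT_BUS_TYPE=nats
NATS_SERVERS=nats://nats:4222
NATS_STREAM_NAME=TAX_AGENT_EVENTS
NATS_CONSUMER_GROUP=tax-agent
```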
#### Option 2: Direct Configuration
```python
from libs.events import create_event_bus
@@ -177,17 +192,18 @@ nats --server=nats://localhost:4222 stream info TAX_AGENT_EVENTS
### Environment Variables
| Variable | Default | Description |
|----------|---------|-------------|
| `NATS_SERVERS` | `nats://nats:4222` | NATS server connection string |
| `NATS_STREAM_NAME` | `TAX_AGENT_EVENTS` | JetStream stream name |
| `NATS_CONSUMER_GROUP` | `tax-agent` | Consumer group name |
| `NATS_LOG_LEVEL` | `info` | NATS server log level |
| `EVENT_BUS_TYPE` | `memory` | Event bus type (memory/kafka/nats) |
| Variable | Default | Description |
| --------------------- | ------------------ | ---------------------------------- |
| `NATS_SERVERS` | `nats://nats:4222` | NATS server connection string |
| `NATS_STREAM_NAME` | `TAX_AGENT_EVENTS` | JetStream stream name |
| `NATS_CONSUMER_GROUP` | `tax-agent` | Consumer group name |
| `NATS_LOG_LEVEL` | `info` | NATS server log level |
| `EVENT_BUS_TYPE` | `memory` | Event bus type (memory/kafka/nats) |
### NATS Server Configuration
The NATS server is configured with:
- **JetStream**: Enabled for persistent messaging
- **File Storage**: 10GB maximum
- **Memory Storage**: 1GB maximum
@@ -219,26 +235,31 @@ The NATS server is configured with:
## Benefits
### 1. **High Performance**
- Very low latency messaging
- High throughput with minimal overhead
- Efficient binary protocol
### 2. **Operational Simplicity**
- Single binary deployment
- Minimal configuration required
- Built-in monitoring and health checks
### 3. **Reliability**
- JetStream provides persistence
- Automatic message acknowledgment
- Configurable retry policies
### 4. **Scalability**
- Ready for clustering (port 6222 configured)
- Horizontal scaling support
- Load balancing across consumers
### 5. **Integration**
- Seamless integration with existing services
- Traefik routing for web UI
- Authentik authentication for monitoring
@@ -246,27 +267,30 @@ The NATS server is configured with:
## Next Steps
1. **Test the Integration**:
```bash
# Start the stack
docker-compose -f docker-compose.local.yml up -d
docker compose up -d
# Check NATS is running
docker-compose -f docker-compose.local.yml ps nats
docker compose ps nats
# View NATS logs
docker-compose -f docker-compose.local.yml logs nats
docker compose logs nats
```
2. **Switch to NATS**:
```bash
# Update environment
echo "EVENT_BUS_TYPE=nats" >> .env
# Restart services
docker-compose -f docker-compose.local.yml restart
docker compose restart
```
3. **Monitor Usage**:
- Access monitoring at `https://nats.local`
- Use NATS CLI for detailed monitoring
- Check application logs for event processing

View File

@@ -20,16 +20,16 @@ curl http://localhost:8000/healthz
```bash
# Start all services
cd infra/compose
docker-compose -f docker-compose.local.yml up -d
docker compose up -d
# Check status
docker-compose -f docker-compose.local.yml ps
docker compose ps
# View logs
docker-compose -f docker-compose.local.yml logs -f svc-ingestion
docker compose logs -f svc-ingestion
# Stop all services
docker-compose -f docker-compose.local.yml down
docker compose down
```
## 🔍 Checking Status
@@ -39,13 +39,13 @@ docker-compose -f docker-compose.local.yml down
```bash
# Check all services
cd infra/compose
docker-compose -f docker-compose.local.yml ps
docker compose ps
# Count healthy services
docker-compose -f docker-compose.local.yml ps | grep -c "healthy"
docker compose ps | grep -c "healthy"
# Check specific service
docker-compose -f docker-compose.local.yml ps svc-ingestion
docker compose ps svc-ingestion
```
### Logs
@@ -53,16 +53,16 @@ docker-compose -f docker-compose.local.yml ps svc-ingestion
```bash
# View service logs
cd infra/compose
docker-compose -f docker-compose.local.yml logs -f SERVICE_NAME
docker compose logs -f SERVICE_NAME
# View last 50 lines
docker-compose -f docker-compose.local.yml logs --tail=50 SERVICE_NAME
docker compose logs --tail=50 SERVICE_NAME
# View logs since 5 minutes ago
docker-compose -f docker-compose.local.yml logs --since 5m SERVICE_NAME
docker compose logs --since 5m SERVICE_NAME
# Search logs for errors
docker-compose -f docker-compose.local.yml logs SERVICE_NAME | grep -i error
docker compose logs SERVICE_NAME | grep -i error
```
### Health Checks
@@ -70,7 +70,7 @@ docker-compose -f docker-compose.local.yml logs SERVICE_NAME | grep -i error
```bash
# Check Traefik health check status
cd infra/compose
docker-compose -f docker-compose.local.yml logs traefik --since 5m | grep -i "health"
docker compose logs traefik --since 5m | grep -i "health"
# Should show no errors (only certificate warnings are OK)
```
@@ -119,13 +119,13 @@ curl -X POST http://localhost:8000/upload \
```bash
# Check logs for errors
cd infra/compose
docker-compose -f docker-compose.local.yml logs SERVICE_NAME --tail=100
docker compose logs SERVICE_NAME --tail=100
# Restart service
docker-compose -f docker-compose.local.yml restart SERVICE_NAME
docker compose restart SERVICE_NAME
# Rebuild and restart
docker-compose -f docker-compose.local.yml up -d --build SERVICE_NAME
docker compose up -d --build SERVICE_NAME
```
### Infrastructure Issues
@@ -133,13 +133,13 @@ docker-compose -f docker-compose.local.yml up -d --build SERVICE_NAME
```bash
# Check infrastructure services
cd infra/compose
docker-compose -f docker-compose.local.yml ps postgres redis minio neo4j
docker compose ps postgres redis minio neo4j
# Restart infrastructure
docker-compose -f docker-compose.local.yml restart postgres redis minio neo4j
docker compose restart postgres redis minio neo4j
# Check connectivity
docker-compose -f docker-compose.local.yml exec svc-ingestion ping -c 3 postgres
docker compose exec svc-ingestion ping -c 3 postgres
```
### Health Check Failures
@@ -147,13 +147,13 @@ docker-compose -f docker-compose.local.yml exec svc-ingestion ping -c 3 postgres
```bash
# Check Traefik logs
cd infra/compose
docker-compose -f docker-compose.local.yml logs traefik --tail=100 | grep -i "health\|error"
docker compose logs traefik --tail=100 | grep -i "health\|error"
# Test health endpoint directly
docker-compose -f docker-compose.local.yml exec SERVICE_NAME curl -f http://localhost:8000/healthz
docker compose exec SERVICE_NAME curl -f http://localhost:8000/healthz
# Restart Traefik
docker-compose -f docker-compose.local.yml restart traefik
docker compose restart traefik
```
### Authentication Issues
@@ -191,10 +191,10 @@ open http://localhost:8080
```bash
# PostgreSQL
docker-compose -f infra/compose/docker-compose.local.yml exec postgres psql -U postgres
docker compose exec postgres psql -U postgres
# Redis
docker-compose -f infra/compose/docker-compose.local.yml exec redis redis-cli
docker compose exec redis redis-cli
# Neo4j Browser
open http://localhost:7474
@@ -206,14 +206,14 @@ open http://localhost:7474
```bash
cd infra/compose
docker-compose -f docker-compose.local.yml restart
docker compose restart
```
### Restart Single Service
```bash
cd infra/compose
docker-compose -f docker-compose.local.yml restart svc-ingestion
docker compose restart svc-ingestion
```
### View Service Configuration
@@ -280,6 +280,7 @@ make dev-service SERVICE=svc_ingestion
1. **Create Environment**: "AI Tax Agent - Development"
2. **Add Variables**:
- `base_url`: `http://localhost:8000`
- `auth_user`: `dev-user`
- `auth_email`: `dev@example.com`
@@ -337,13 +338,13 @@ docker-compose -f docker-compose.local.yml ps | grep svc-ingestion
### Common Issues
| Issue | Solution |
|-------|----------|
| 401 Unauthorized | Use `DISABLE_AUTH=true` or add auth headers |
| Connection refused | Check service is running: `docker-compose ps` |
| 500 Internal Error | Check logs: `docker-compose logs SERVICE_NAME` |
| Issue | Solution |
| -------------------- | ------------------------------------------------- |
| 401 Unauthorized | Use `DISABLE_AUTH=true` or add auth headers |
| Connection refused | Check service is running: `docker-compose ps` |
| 500 Internal Error | Check logs: `docker-compose logs SERVICE_NAME` |
| Health check failing | Check Traefik logs: `docker-compose logs traefik` |
| Port already in use | Stop conflicting service or change port |
## 🎯 Quick Commands
@@ -366,22 +367,22 @@ cd infra/compose && docker-compose -f docker-compose.local.yml down
## 🔄 Service Ports
| Service | Port | Access |
|---------|------|--------|
| svc-ingestion | 8000 | http://localhost:8000 |
| PostgreSQL | 5432 | localhost:5432 |
| Redis | 6379 | localhost:6379 |
| MinIO Console | 9093 | http://localhost:9093 |
| MinIO API | 9092 | http://localhost:9092 |
| Neo4j Browser | 7474 | http://localhost:7474 |
| Neo4j Bolt | 7687 | bolt://localhost:7687 |
| Qdrant | 6333 | http://localhost:6333 |
| NATS | 4222 | nats://localhost:4222 |
| Prometheus | 9090 | http://localhost:9090 |
| Grafana | 3000 | http://localhost:3000 |
| Service | Port | Access |
| ----------------- | ---- | --------------------- |
| svc-ingestion | 8000 | http://localhost:8000 |
| PostgreSQL | 5432 | localhost:5432 |
| Redis | 6379 | localhost:6379 |
| MinIO Console | 9093 | http://localhost:9093 |
| MinIO API | 9092 | http://localhost:9092 |
| Neo4j Browser | 7474 | http://localhost:7474 |
| Neo4j Bolt | 7687 | bolt://localhost:7687 |
| Qdrant | 6333 | http://localhost:6333 |
| NATS | 4222 | nats://localhost:4222 |
| Prometheus | 9090 | http://localhost:9090 |
| Grafana | 3000 | http://localhost:3000 |
| Traefik Dashboard | 8080 | http://localhost:8080 |
| Vault | 8200 | http://localhost:8200 |
| Unleash | 4242 | http://localhost:4242 |
## ✅ Health Check
@@ -413,4 +414,3 @@ fi
```
Save this as `check-health.sh` and run with `bash check-health.sh`

BIN
docs/SA150-Notes-2025.pdf Normal file

Binary file not shown.

BIN
graphmert.pdf Normal file

Binary file not shown.

View File

@@ -2,6 +2,8 @@
Multi-environment Docker Compose infrastructure for AI Tax Agent.
For local development, use the dedicated self-signed stack in `infra/compose` (see `infra/compose/README.md`). For remote environments, use the shared base files with `infra/scripts/deploy.sh` and the environment files in `infra/environments`.
## Directory Structure
```
@@ -244,4 +246,3 @@ For issues or questions:
- Check logs: `docker compose logs -f <service>`
- Review documentation in `docs/`
- Check Traefik dashboard for routing issues

View File

@@ -0,0 +1,370 @@
# FILE: blueprints/ai-tax-agent-bootstrap.yaml
# Authentik Bootstrap (v2025.x): users, groups, scope mappings, OIDC providers, applications
version: 1
metadata:
name: AI Tax Agent — Bootstrap + OIDC Providers
entries:
# --- Groups first (so the admin user can reference them) -------------------
- model: authentik_core.group
state: present
identifiers:
name: "Administrators"
attrs:
is_superuser: true
- model: authentik_core.group
state: present
identifiers:
name: "Tax Reviewers"
attrs:
is_superuser: false
- model: authentik_core.group
state: present
identifiers:
name: "Accountants"
attrs:
is_superuser: false
- model: authentik_core.group
state: present
identifiers:
name: "Clients"
attrs:
is_superuser: false
# --- Admin user ------------------------------------------------------------
- model: authentik_core.user
state: present
identifiers:
username: admin
attrs:
name: "System Administrator"
email: admin@local.lan
is_active: true
is_staff: true
is_superuser: true
groups:
- !Find [authentik_core.group, [name, "Administrators"]]
# Helper finders
# ========= OIDC Providers + Applications ==================================
# --- UI Review (Proxy Provider for ForwardAuth) ---------------------------
- model: authentik_providers_proxy.proxyprovider
state: present
identifiers:
name: "UI Review Proxy"
attrs:
external_host: "https://review.local.lan"
internal_host: "http://ui-review:3030"
authorization_flow:
!Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
invalidation_flow:
!Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
mode: "forward_single"
cookie_domain: "local.lan"
- model: authentik_core.application
state: present
identifiers:
slug: "ui-review"
attrs:
name: "UI Review"
provider:
!Find [
authentik_providers_proxy.proxyprovider,
[name, "UI Review Proxy"],
]
meta_launch_url: "https://review.local.lan"
meta_description: "Tax Agent Platform - Review UI"
meta_publisher: "AI Tax Agent"
policy_engine_mode: "any"
# --- Vault OIDC Provider --------------------------------------------------
- model: authentik_providers_oauth2.oauth2provider
state: present
identifiers:
name: "Vault OIDC"
attrs:
client_id: "vault"
client_secret: !Env [AUTHENTIK_VAULT_CLIENT_SECRET, "changeme"]
client_type: "confidential"
redirect_uris:
- matching_mode: strict
url: "https://vault.local.lan/ui/vault/auth/oidc/oidc/callback"
- matching_mode: strict
url: "https://vault.local.lan/oidc/callback"
- matching_mode: strict
url: "http://localhost:8250/oidc/callback"
sub_mode: "hashed_user_id"
include_claims_in_id_token: true
issuer_mode: "per_provider"
signing_key:
!Find [
authentik_crypto.certificatekeypair,
[name, "authentik Self-signed Certificate"],
]
property_mappings:
- !Find [
authentik_providers_oauth2.scopemapping,
[scope_name, "openid"],
]
- !Find [authentik_providers_oauth2.scopemapping, [scope_name, "email"]]
- !Find [
authentik_providers_oauth2.scopemapping,
[scope_name, "profile"],
]
authorization_flow:
!Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
invalidation_flow:
!Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
- model: authentik_core.application
state: present
identifiers:
slug: "vault-oidc"
attrs:
name: "Vault OIDC"
provider:
!Find [authentik_providers_oauth2.oauth2provider, [name, "Vault OIDC"]]
meta_launch_url: "https://vault.local.lan"
meta_description: "Vault OIDC Authentication"
meta_publisher: "AI Tax Agent"
policy_engine_mode: "any"
# --- MinIO OIDC Provider --------------------------------------------------
# Scope Mapping for MinIO Policy
- model: authentik_providers_oauth2.scopemapping
state: present
identifiers:
name: "MinIO Policy Mapping"
attrs:
name: "MinIO Policy Mapping"
description: "Maps Authentik users to MinIO policies"
scope_name: "minio"
expression: |
# Default to readwrite for all authenticated users
# You can customize this based on groups
return {
"policy": "readwrite"
}
- model: authentik_providers_oauth2.oauth2provider
state: present
identifiers:
name: "MinIO OIDC"
attrs:
client_id: "minio"
client_secret: !Env [AUTHENTIK_MINIO_CLIENT_SECRET, "changeme"]
client_type: "confidential"
redirect_uris:
- matching_mode: strict
url: "https://minio.local.lan/oauth_callback"
sub_mode: "hashed_user_id"
include_claims_in_id_token: true
issuer_mode: "per_provider"
signing_key:
!Find [
authentik_crypto.certificatekeypair,
[name, "authentik Self-signed Certificate"],
]
property_mappings:
- !Find [
authentik_providers_oauth2.scopemapping,
[scope_name, "openid"],
]
- !Find [authentik_providers_oauth2.scopemapping, [scope_name, "email"]]
- !Find [
authentik_providers_oauth2.scopemapping,
[scope_name, "profile"],
]
- !Find [
authentik_providers_oauth2.scopemapping,
[name, "MinIO Policy Mapping"],
]
authorization_flow:
!Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
invalidation_flow:
!Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
- model: authentik_core.application
state: present
identifiers:
slug: "minio-oidc"
attrs:
name: "MinIO OIDC"
provider:
!Find [authentik_providers_oauth2.oauth2provider, [name, "MinIO OIDC"]]
meta_launch_url: "https://minio.local.lan"
meta_description: "MinIO Object Storage OIDC"
meta_publisher: "AI Tax Agent"
policy_engine_mode: "any"
# --- Grafana SSO Configuration -------------------------------------------
# Custom Role Mapping for Grafana
- model: authentik_providers_oauth2.scopemapping
state: present
identifiers:
name: "Grafana Role Mapping"
attrs:
name: "Grafana Role Mapping"
description: "Maps Authentik groups to Grafana roles"
scope_name: "role"
expression: |
# Map Authentik groups to Grafana roles
user_groups = [group.name for group in request.user.ak_groups.all()]
# Admin role mapping
if "authentik Admins" in user_groups or "Administrators" in user_groups:
return "Admin"
# Editor role mapping
if "Tax Reviewers" in user_groups or "Accountants" in user_groups:
return "Editor"
# Default to Viewer role
return "Viewer"
# Grafana OAuth2 Provider
- model: authentik_providers_oauth2.oauth2provider
state: present
identifiers:
name: "Grafana"
attrs:
client_id: !Env [GRAFANA_OAUTH_CLIENT_ID, "grafana"]
client_secret: !Env [GRAFANA_OAUTH_CLIENT_SECRET, "changeme"]
client_type: "confidential"
redirect_uris:
- matching_mode: strict
url: "https://grafana.local.lan/login/generic_oauth"
sub_mode: "hashed_user_id"
include_claims_in_id_token: true
issuer_mode: "per_provider"
signing_key:
!Find [
authentik_crypto.certificatekeypair,
[name, "authentik Self-signed Certificate"],
]
property_mappings:
- !Find [
authentik_providers_oauth2.scopemapping,
[scope_name, "openid"],
]
- !Find [authentik_providers_oauth2.scopemapping, [scope_name, "email"]]
- !Find [
authentik_providers_oauth2.scopemapping,
[scope_name, "profile"],
]
- !Find [
authentik_providers_oauth2.scopemapping,
[name, "Grafana Role Mapping"],
]
authorization_flow:
!Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
invalidation_flow:
!Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
# Grafana Application
- model: authentik_core.application
state: present
identifiers:
slug: "grafana"
attrs:
name: "Grafana"
provider:
!Find [authentik_providers_oauth2.oauth2provider, [name, "Grafana"]]
meta_launch_url: "https://grafana.local.lan"
meta_description: "Grafana monitoring and observability platform"
meta_publisher: "Grafana Labs"
policy_engine_mode: "any"
# --- Traefik Dashboard (Proxy Provider for ForwardAuth) -------------------
- model: authentik_providers_proxy.proxyprovider
state: present
identifiers:
name: "Traefik Dashboard Proxy"
attrs:
external_host: "https://traefik.local.lan"
internal_host: "http://apa-traefik:8080"
authorization_flow:
!Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
invalidation_flow:
!Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
mode: "forward_single"
cookie_domain: "local.lan"
- model: authentik_core.application
state: present
identifiers:
slug: "traefik-dashboard"
attrs:
name: "Traefik Dashboard"
provider:
!Find [
authentik_providers_proxy.proxyprovider,
[name, "Traefik Dashboard Proxy"],
]
meta_launch_url: "https://traefik.local.lan"
meta_description: "Traefik Edge Router Dashboard"
meta_publisher: "AI Tax Agent"
policy_engine_mode: "any"
# --- AI Tax Agent API (Proxy Provider for ForwardAuth) --------------------
- model: authentik_providers_proxy.proxyprovider
state: present
identifiers:
name: "AI Tax Agent API Proxy"
attrs:
external_host: "https://api.local.lan"
internal_host: "http://apa-traefik:8080"
authorization_flow:
!Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
invalidation_flow:
!Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
mode: "forward_single"
cookie_domain: "local.lan"
- model: authentik_core.application
state: present
identifiers:
slug: "ai-tax-agent-api-gateway"
attrs:
name: "AI Tax Agent API Gateway"
provider:
!Find [
authentik_providers_proxy.proxyprovider,
[name, "AI Tax Agent API Proxy"],
]
meta_launch_url: "https://api.local.lan"
meta_description: "AI Tax Agent API Gateway"
meta_publisher: "AI Tax Agent"
policy_engine_mode: "any"
# --- Outpost Configuration ------------------------------------------------
- model: authentik_outposts.outpost
state: present
identifiers:
name: "authentik Embedded Outpost"
attrs:
token: !Env [AUTHENTIK_OUTPOST_TOKEN, "changeme"]
providers:
- !Find [
authentik_providers_proxy.proxyprovider,
[name, "Traefik Dashboard Proxy"],
]
- !Find [
authentik_providers_proxy.proxyprovider,
[name, "UI Review Proxy"],
]
- !Find [
authentik_providers_proxy.proxyprovider,
[name, "AI Tax Agent API Proxy"],
]

View File

@@ -20,6 +20,7 @@ volumes:
vault_data:
redis_data:
nats_data:
authentik_data:
services:
# Edge Gateway & SSO
@@ -37,6 +38,14 @@ services:
volumes:
- /var/run/docker.sock:/var/run/docker.sock:ro
- ./traefik/config/:/etc/traefik/:ro
labels:
- "traefik.enable=true"
- "traefik.http.routers.dashboard.rule=Host(`traefik.${DOMAIN}`)"
- "traefik.http.routers.dashboard.entrypoints=websecure"
- "traefik.http.routers.dashboard.tls=true"
- "traefik.http.routers.dashboard.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.dashboard.service=api@internal"
- "traefik.http.routers.dashboard.middlewares=authentik-forwardauth@file"
# Identity & SSO (Authentik)
apa-authentik-db:
@@ -46,7 +55,7 @@ services:
networks:
- backend
volumes:
- postgres_data:/var/lib/postgresql/data
- authentik_data:/var/lib/postgresql/data
environment:
POSTGRES_DB: authentik
POSTGRES_USER: authentik
@@ -94,7 +103,7 @@ services:
- "traefik.http.routers.authentik.rule=Host(`auth.${DOMAIN}`)"
- "traefik.http.routers.authentik.entrypoints=websecure"
- "traefik.http.routers.authentik.tls=true"
- "traefik.http.routers.authentik.tls.certresolver=godaddy"
- "traefik.http.routers.authentik.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.services.authentik.loadbalancer.server.port=9000"
apa-authentik-worker:
@@ -149,18 +158,23 @@ services:
command: vault server -dev -dev-listen-address=0.0.0.0:8200
cap_add:
- IPC_LOCK
extra_hosts:
- "auth.local.lan:host-gateway"
- "vault.local.lan:host-gateway"
- "minio.local.lan:host-gateway"
- "api.local.lan:host-gateway"
- "traefik.local.lan:host-gateway"
labels:
- "traefik.enable=true"
- "traefik.http.routers.vault.rule=Host(`vault.${DOMAIN}`)"
- "traefik.http.routers.vault.entrypoints=websecure"
- "traefik.http.routers.vault.tls=true"
- "traefik.http.routers.vault.tls.certresolver=godaddy"
- "traefik.http.routers.vault.middlewares=authentik-forwardauth@file"
- "traefik.http.routers.vault.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.services.vault.loadbalancer.server.port=8200"
# Object Storage
apa-minio:
image: minio/minio:RELEASE.2025-09-07T16-13-09Z
image: minio/minio:RELEASE.2025-04-22T22-12-26Z
container_name: apa-minio
restart: unless-stopped
networks:
@@ -172,26 +186,35 @@ services:
MINIO_ROOT_USER: ${MINIO_ROOT_USER}
MINIO_ROOT_PASSWORD: ${MINIO_ROOT_PASSWORD}
MINIO_BROWSER_REDIRECT_URL: https://minio.${DOMAIN}
MINIO_IDENTITY_OPENID_CONFIG_URL: "https://auth.${DOMAIN}/application/o/minio-oidc/.well-known/openid-configuration"
MINIO_IDENTITY_OPENID_CLIENT_ID: "minio"
MINIO_IDENTITY_OPENID_CLIENT_SECRET: ${AUTHENTIK_MINIO_CLIENT_SECRET}
MINIO_IDENTITY_OPENID_SCOPES: "openid,profile,email,minio"
MINIO_IDENTITY_OPENID_REDIRECT_URI: "https://minio.${DOMAIN}/oauth_callback"
MINIO_IDENTITY_OPENID_DISPLAY_NAME: "Login with Authentik"
command: server /data --address ":9092" --console-address ":9093"
healthcheck:
test: ["CMD", "mc", "--version"]
test: ["CMD", "curl", "-f", "http://localhost:9092/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
extra_hosts:
- "auth.local.lan:host-gateway"
- "minio.local.lan:host-gateway"
- "api.local.lan:host-gateway"
- "traefik.local.lan:host-gateway"
labels:
- "traefik.enable=true"
- "traefik.http.routers.minio-api.rule=Host(`minio-api.${DOMAIN}`)"
- "traefik.http.routers.minio-api.entrypoints=websecure"
- "traefik.http.routers.minio-api.tls=true"
- "traefik.http.routers.minio-api.tls.certresolver=godaddy"
- "traefik.http.routers.minio-api.middlewares=authentik-forwardauth@file"
- "traefik.http.routers.minio-api.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.minio-api.service=minio-api"
- "traefik.http.services.minio-api.loadbalancer.server.port=9092"
- "traefik.http.routers.minio-console.rule=Host(`minio.${DOMAIN}`)"
- "traefik.http.routers.minio-console.entrypoints=websecure"
- "traefik.http.routers.minio-console.tls=true"
- "traefik.http.routers.minio-console.tls.certresolver=godaddy"
- "traefik.http.routers.minio-console.middlewares=authentik-forwardauth@file"
- "traefik.http.routers.minio-console.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.minio-console.service=minio-console"
- "traefik.http.services.minio-console.loadbalancer.server.port=9093"
@@ -214,7 +237,7 @@ services:
- "traefik.http.routers.qdrant.rule=Host(`qdrant.${DOMAIN}`)"
- "traefik.http.routers.qdrant.entrypoints=websecure"
- "traefik.http.routers.qdrant.tls=true"
- "traefik.http.routers.qdrant.tls.certresolver=godaddy"
- "traefik.http.routers.qdrant.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.qdrant.middlewares=authentik-forwardauth@file"
- "traefik.http.services.qdrant.loadbalancer.server.port=6333"
@@ -242,7 +265,7 @@ services:
- "traefik.http.routers.neo4j.rule=Host(`neo4j.${DOMAIN}`)"
- "traefik.http.routers.neo4j.entrypoints=websecure"
- "traefik.http.routers.neo4j.tls=true"
- "traefik.http.routers.neo4j.tls.certresolver=godaddy"
- "traefik.http.routers.neo4j.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.neo4j.middlewares=authentik-forwardauth@file"
- "traefik.http.services.neo4j.loadbalancer.server.port=7474"
@@ -334,6 +357,6 @@ services:
- "traefik.http.routers.nats-monitor.rule=Host(`nats.${DOMAIN}`)"
- "traefik.http.routers.nats-monitor.entrypoints=websecure"
- "traefik.http.routers.nats-monitor.tls=true"
- "traefik.http.routers.nats-monitor.tls.certresolver=godaddy"
- "traefik.http.routers.nats-monitor.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.nats-monitor.middlewares=authentik-forwardauth@file"
- "traefik.http.services.nats-monitor.loadbalancer.server.port=8222"

infra/base/loki/loki.yml (new file, 30 lines)

@@ -0,0 +1,30 @@
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
instance_addr: 127.0.0.1
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
- from: 2020-10-24
store: boltdb-shipper
object_store: filesystem
schema: v11
index:
prefix: index_
period: 24h
ruler:
alertmanager_url: http://localhost:9093

View File

@@ -0,0 +1,26 @@
server:
http_listen_port: 9080
grpc_listen_port: 0
positions:
filename: /tmp/positions.yaml
clients:
- url: http://apa-loki:3100/loki/api/v1/push
scrape_configs:
- job_name: system
static_configs:
- targets:
- localhost
labels:
job: varlogs
__path__: /var/log/*log
- job_name: docker
static_configs:
- targets:
- localhost
labels:
job: docker
__path__: /var/lib/docker/containers/*/*-json.log

View File

@@ -39,7 +39,7 @@ services:
- "traefik.http.routers.prometheus.rule=Host(`prometheus.${DOMAIN}`)"
- "traefik.http.routers.prometheus.entrypoints=websecure"
- "traefik.http.routers.prometheus.tls=true"
- "traefik.http.routers.prometheus.tls.certresolver=godaddy"
- "traefik.http.routers.prometheus.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.prometheus.middlewares=authentik-forwardauth@file"
- "traefik.http.services.prometheus.loadbalancer.server.port=9090"
@@ -80,12 +80,19 @@ services:
GF_SECURITY_COOKIE_SECURE: true
GF_SECURITY_COOKIE_SAMESITE: lax
GF_AUTH_GENERIC_OAUTH_USE_PKCE: true
GF_AUTH_GENERIC_OAUTH_TLS_SKIP_VERIFY_INSECURE: true
GF_AUTH_SIGNOUT_REDIRECT_URL: https://auth.${DOMAIN}/application/o/grafana/end-session/
extra_hosts:
- "auth.local.lan:host-gateway"
- "grafana.local.lan:host-gateway"
- "api.local.lan:host-gateway"
- "traefik.local.lan:host-gateway"
labels:
- "traefik.enable=true"
- "traefik.http.routers.grafana.rule=Host(`grafana.${DOMAIN}`)"
- "traefik.http.routers.grafana.entrypoints=websecure"
- "traefik.http.routers.grafana.tls=true"
- "traefik.http.routers.grafana.tls.certresolver=godaddy"
- "traefik.http.routers.grafana.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.services.grafana.loadbalancer.server.port=3000"
# Log Aggregation
@@ -105,7 +112,7 @@ services:
- "traefik.http.routers.loki.rule=Host(`loki.${DOMAIN}`)"
- "traefik.http.routers.loki.entrypoints=websecure"
- "traefik.http.routers.loki.tls=true"
- "traefik.http.routers.loki.tls.certresolver=godaddy"
- "traefik.http.routers.loki.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.loki.middlewares=authentik-forwardauth@file"
- "traefik.http.services.loki.loadbalancer.server.port=3100"

View File

@@ -0,0 +1,21 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
- job_name: "traefik"
static_configs:
- targets: ["apa-traefik:8080"]
- job_name: "services"
static_configs:
- targets:
- "apa-svc-ingestion:8000"
- "apa-svc-extract:8000"
- "apa-svc-kg:8000"
- "apa-svc-rag-retriever:8000"
- "apa-svc-rag-indexer:8000"

View File

@@ -40,8 +40,8 @@ services:
- "traefik.http.routers.svc-ingestion.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/ingestion`)"
- "traefik.http.routers.svc-ingestion.entrypoints=websecure"
- "traefik.http.routers.svc-ingestion.tls=true"
- "traefik.http.routers.svc-ingestion.tls.certresolver=godaddy"
- "traefik.http.routers.svc-ingestion.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-ingestion.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-ingestion.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-ingestion.loadbalancer.server.port=8000"
# Data Extraction Service
@@ -73,8 +73,8 @@ services:
- "traefik.http.routers.svc-extract.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/extract`)"
- "traefik.http.routers.svc-extract.entrypoints=websecure"
- "traefik.http.routers.svc-extract.tls=true"
- "traefik.http.routers.svc-extract.tls.certresolver=godaddy"
- "traefik.http.routers.svc-extract.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-extract.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-extract.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-extract.loadbalancer.server.port=8000"
# Knowledge Graph Service
@@ -100,8 +100,8 @@ services:
- "traefik.http.routers.svc-kg.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/kg`)"
- "traefik.http.routers.svc-kg.entrypoints=websecure"
- "traefik.http.routers.svc-kg.tls=true"
- "traefik.http.routers.svc-kg.tls.certresolver=godaddy"
- "traefik.http.routers.svc-kg.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-kg.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-kg.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-kg.loadbalancer.server.port=8000"
# RAG Retrieval Service
@@ -130,8 +130,8 @@ services:
- "traefik.http.routers.svc-rag-retriever.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/rag`)"
- "traefik.http.routers.svc-rag-retriever.entrypoints=websecure"
- "traefik.http.routers.svc-rag-retriever.tls=true"
- "traefik.http.routers.svc-rag-retriever.tls.certresolver=godaddy"
- "traefik.http.routers.svc-rag-retriever.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-rag-retriever.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-rag-retriever.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-rag-retriever.loadbalancer.server.port=8000"
# Forms Service
@@ -163,8 +163,8 @@ services:
- "traefik.http.routers.svc-forms.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/forms`)"
- "traefik.http.routers.svc-forms.entrypoints=websecure"
- "traefik.http.routers.svc-forms.tls=true"
- "traefik.http.routers.svc-forms.tls.certresolver=godaddy"
- "traefik.http.routers.svc-forms.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-forms.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-forms.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-forms.loadbalancer.server.port=8000"
# HMRC Integration Service
@@ -197,8 +197,8 @@ services:
- "traefik.http.routers.svc-hmrc.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/hmrc`)"
- "traefik.http.routers.svc-hmrc.entrypoints=websecure"
- "traefik.http.routers.svc-hmrc.tls=true"
- "traefik.http.routers.svc-hmrc.tls.certresolver=godaddy"
- "traefik.http.routers.svc-hmrc.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-hmrc.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-hmrc.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-hmrc.loadbalancer.server.port=8000"
# OCR Service
@@ -230,8 +230,8 @@ services:
- "traefik.http.routers.svc-ocr.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/ocr`)"
- "traefik.http.routers.svc-ocr.entrypoints=websecure"
- "traefik.http.routers.svc-ocr.tls=true"
- "traefik.http.routers.svc-ocr.tls.certresolver=godaddy"
- "traefik.http.routers.svc-ocr.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-ocr.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-ocr.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-ocr.loadbalancer.server.port=8000"
# RAG Indexer Service
@@ -263,8 +263,8 @@ services:
- "traefik.http.routers.svc-rag-indexer.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/rag-indexer`)"
- "traefik.http.routers.svc-rag-indexer.entrypoints=websecure"
- "traefik.http.routers.svc-rag-indexer.tls=true"
- "traefik.http.routers.svc-rag-indexer.tls.certresolver=godaddy"
- "traefik.http.routers.svc-rag-indexer.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-rag-indexer.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-rag-indexer.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-rag-indexer.loadbalancer.server.port=8000"
# Reasoning Service
@@ -296,8 +296,8 @@ services:
- "traefik.http.routers.svc-reason.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/reason`)"
- "traefik.http.routers.svc-reason.entrypoints=websecure"
- "traefik.http.routers.svc-reason.tls=true"
- "traefik.http.routers.svc-reason.tls.certresolver=godaddy"
- "traefik.http.routers.svc-reason.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-reason.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-reason.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-reason.loadbalancer.server.port=8000"
# RPA Service
@@ -329,8 +329,8 @@ services:
- "traefik.http.routers.svc-rpa.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/rpa`)"
- "traefik.http.routers.svc-rpa.entrypoints=websecure"
- "traefik.http.routers.svc-rpa.tls=true"
- "traefik.http.routers.svc-rpa.tls.certresolver=godaddy"
- "traefik.http.routers.svc-rpa.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-rpa.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-rpa.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-rpa.loadbalancer.server.port=8000"
# Normalize & Map Service
@@ -362,8 +362,8 @@ services:
- "traefik.http.routers.svc-normalize-map.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/normalize-map`)"
- "traefik.http.routers.svc-normalize-map.entrypoints=websecure"
- "traefik.http.routers.svc-normalize-map.tls=true"
- "traefik.http.routers.svc-normalize-map.tls.certresolver=godaddy"
- "traefik.http.routers.svc-normalize-map.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-normalize-map.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-normalize-map.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-normalize-map.loadbalancer.server.port=8000"
# Coverage Service
@@ -395,8 +395,8 @@ services:
- "traefik.http.routers.svc-coverage.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/coverage`)"
- "traefik.http.routers.svc-coverage.entrypoints=websecure"
- "traefik.http.routers.svc-coverage.tls=true"
- "traefik.http.routers.svc-coverage.tls.certresolver=godaddy"
- "traefik.http.routers.svc-coverage.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-coverage.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-coverage.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-coverage.loadbalancer.server.port=8000"
# Firm Connectors Service
@@ -428,8 +428,8 @@ services:
- "traefik.http.routers.svc-firm-connectors.rule=Host(`api.${DOMAIN}`) && PathPrefix(`/firm-connectors`)"
- "traefik.http.routers.svc-firm-connectors.entrypoints=websecure"
- "traefik.http.routers.svc-firm-connectors.tls=true"
- "traefik.http.routers.svc-firm-connectors.tls.certresolver=godaddy"
- "traefik.http.routers.svc-firm-connectors.middlewares=authentik-forwardauth@file,rate-limit@file"
- "traefik.http.routers.svc-firm-connectors.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.svc-firm-connectors.middlewares=authentik-forwardauth@file,rate-limit@file,strip-api-prefixes@file"
- "traefik.http.services.svc-firm-connectors.loadbalancer.server.port=8000"
# Review UI
@@ -448,6 +448,6 @@ services:
- "traefik.http.routers.ui-review.rule=Host(`app.${DOMAIN}`)"
- "traefik.http.routers.ui-review.entrypoints=websecure"
- "traefik.http.routers.ui-review.tls=true"
- "traefik.http.routers.ui-review.tls.certresolver=godaddy"
- "traefik.http.routers.ui-review.tls.certresolver=${TRAEFIK_CERT_RESOLVER}"
- "traefik.http.routers.ui-review.middlewares=authentik-forwardauth@file"
- "traefik.http.services.ui-review.loadbalancer.server.port=3030"

View File

@@ -1,133 +1,23 @@
# External Services
# Compose Stacks
This directory contains Docker Compose configurations for external services that run on the production server.
This folder is for the self-contained local stack (self-signed TLS) and Traefik assets. Remote environments use the shared compose files in `infra/base` together with `infra/scripts/deploy.sh`.
## Services
## Local development (self-signed TLS)
- Copy envs: `cp infra/compose/env.example infra/compose/.env` then set passwords/secrets and the dev domain (defaults to `local.lan`).
- Host aliases: add the domain to `/etc/hosts` (e.g. `127.0.0.1 auth.local.lan api.local.lan grafana.local.lan vault.local.lan minio.local.lan`).
- Networks: `./infra/scripts/setup-networks.sh` (creates `apa-frontend` and `apa-backend` used everywhere).
- Run: `cd infra/compose && docker compose --env-file .env -f docker-compose.local.yml up -d`.
- Stop: `docker compose --env-file .env -f docker-compose.local.yml down`.
- TLS: Traefik mounts `infra/compose/traefik/certs/local.{crt,key}`. Regenerate if needed with `openssl req -x509 -newkey rsa:2048 -nodes -keyout infra/compose/traefik/certs/local.key -out infra/compose/traefik/certs/local.crt -days 365 -subj "/CN=*.local.lan"`.
### Traefik
- **Location**: `traefik/`
- **Purpose**: Reverse proxy and load balancer for all services
- **Deploy**: `cd traefik && docker compose up -d`
- **Access**: https://traefik.harkon.co.uk
## Cloud / remote (Let's Encrypt)
- Config lives in `infra/base` with env files in `infra/environments/{development,production}/.env`.
- Create the same docker networks on the host (`./infra/scripts/setup-networks.sh`) so Traefik and services share `apa-frontend` / `apa-backend`.
- Deploy on the server: `./infra/scripts/deploy.sh <environment> all` (or `infrastructure`, `monitoring`, `services`).
- Certificates: Traefik uses DNS-01 via GoDaddy from the provider env in `infra/base/traefik/config` (make sure `DOMAIN`, ACME email, and provider creds are set in the env file).
### Authentik
- **Location**: `authentik/`
- **Purpose**: SSO and authentication provider
- **Deploy**: `cd authentik && docker compose up -d`
- **Access**: https://authentik.harkon.co.uk
### Gitea
- **Location**: `gitea/`
- **Purpose**: Git repository hosting and container registry
- **Deploy**: `cd gitea && docker compose up -d`
- **Access**: https://gitea.harkon.co.uk
### Nextcloud
- **Location**: `nextcloud/`
- **Purpose**: File storage and collaboration
- **Deploy**: `cd nextcloud && docker compose up -d`
- **Access**: https://nextcloud.harkon.co.uk
### Portainer
- **Location**: `portainer/`
- **Purpose**: Docker management UI
- **Deploy**: `cd portainer && docker compose up -d`
- **Access**: https://portainer.harkon.co.uk
## Deployment
### Production (Remote Server)
```bash
# SSH to server
ssh deploy@141.136.35.199
# Navigate to service directory
cd /opt/ai-tax-agent/infra/compose/<service>
# Deploy service
docker compose up -d
# Check logs
docker compose logs -f
# Check status
docker compose ps
```
### Local Development
For local development, use the all-in-one compose file:
```bash
cd infra/compose
docker compose -f docker-compose.local.yml up -d
```
## Configuration
Each service has its own `.env` file for environment-specific configuration:
- `traefik/.provider.env` - GoDaddy API credentials
- `authentik/.env` - Authentik secrets
- `gitea/.env` - Gitea database credentials
## Networks
All services use shared Docker networks:
- `frontend` - Public-facing services
- `backend` - Internal services
Create networks before deploying:
```bash
docker network create frontend
docker network create backend
```
## Maintenance
### Update Service
```bash
cd /opt/ai-tax-agent/infra/compose/<service>
docker compose pull
docker compose up -d
```
### Restart Service
```bash
cd /opt/ai-tax-agent/infra/compose/<service>
docker compose restart
```
### View Logs
```bash
cd /opt/ai-tax-agent/infra/compose/<service>
docker compose logs -f
```
### Backup Data
```bash
# Backup volumes
docker run --rm -v <service>_data:/data -v $(pwd):/backup alpine tar czf /backup/<service>-backup.tar.gz /data
```
## Integration with Application
These external services are used by the application infrastructure:
- **Traefik** - Routes traffic to application services
- **Authentik** - Provides SSO for application UIs
- **Gitea** - Hosts Docker images for application services
The application infrastructure is deployed separately using:
```bash
./infra/scripts/deploy.sh production infrastructure
./infra/scripts/deploy.sh production services
```
## Files of note
- `docker-compose.local.yml`: full local stack.
- `traefik/traefik.local.yml` and `traefik/traefik-dynamic.local.yml`: static/dynamic Traefik config for local.
- `traefik/certs/`: self-signed certs used by the local proxy.
- `env.example`: defaults for local `.env`.

View File

@@ -0,0 +1,156 @@
# FILE: infra/compose/compose.override.yaml
# Local development overrides
# Automatically loaded by docker compose when compose.yaml is present
services:
# --- Infrastructure Overrides ---
apa-traefik:
volumes:
- ./traefik/traefik.local.yml:/etc/traefik/traefik.yml:ro
- ./traefik/traefik-dynamic.local.yml:/etc/traefik/traefik-dynamic.yml:ro
- ./traefik/certs/:/var/traefik/certs/:ro
ports:
- "8080:8080" # Dashboard (admin entrypoint, insecure mode only for local)
apa-authentik-server:
environment:
AUTHENTIK_ERROR_REPORTING__ENABLED: "false"
DOMAIN: ${DOMAIN:-local.lan}
GRAFANA_OAUTH_CLIENT_ID: ${GRAFANA_OAUTH_CLIENT_ID}
GRAFANA_OAUTH_CLIENT_SECRET: ${GRAFANA_OAUTH_CLIENT_SECRET}
AUTHENTIK_MINIO_CLIENT_SECRET: ${AUTHENTIK_MINIO_CLIENT_SECRET}
AUTHENTIK_VAULT_CLIENT_SECRET: ${AUTHENTIK_VAULT_CLIENT_SECRET}
AUTHENTIK_OUTPOST_TOKEN: ${AUTHENTIK_OUTPOST_TOKEN}
volumes:
- ../authentik/bootstrap.yaml:/blueprints/ai-tax-agent-bootstrap.yaml:ro
apa-authentik-worker:
environment:
DOMAIN: ${DOMAIN:-local.lan}
GRAFANA_OAUTH_CLIENT_ID: ${GRAFANA_OAUTH_CLIENT_ID}
GRAFANA_OAUTH_CLIENT_SECRET: ${GRAFANA_OAUTH_CLIENT_SECRET}
AUTHENTIK_MINIO_CLIENT_SECRET: ${AUTHENTIK_MINIO_CLIENT_SECRET}
AUTHENTIK_VAULT_CLIENT_SECRET: ${AUTHENTIK_VAULT_CLIENT_SECRET}
AUTHENTIK_OUTPOST_TOKEN: ${AUTHENTIK_OUTPOST_TOKEN}
volumes:
- ../authentik/bootstrap.yaml:/blueprints/ai-tax-agent-bootstrap.yaml:ro
apa-vault:
volumes:
- ./traefik/certs/:/certs:ro
# --- Service Build Overrides ---
# Pointing to local source code for building
apa-svc-ingestion:
build:
context: ../../
dockerfile: apps/svc_ingestion/Dockerfile
image: ai-tax-agent/svc-ingestion:local
pull_policy: never
apa-svc-extract:
build:
context: ../../
dockerfile: apps/svc_extract/Dockerfile
image: ai-tax-agent/svc-extract:local
pull_policy: never
apa-svc-kg:
build:
context: ../../
dockerfile: apps/svc_kg/Dockerfile
image: ai-tax-agent/svc-kg:local
pull_policy: never
apa-svc-rag-retriever:
build:
context: ../../
dockerfile: apps/svc_rag_retriever/Dockerfile
image: ai-tax-agent/svc-rag-retriever:local
pull_policy: never
apa-svc-forms:
build:
context: ../../
dockerfile: apps/svc_forms/Dockerfile
image: ai-tax-agent/svc-forms:local
pull_policy: never
apa-svc-hmrc:
build:
context: ../../
dockerfile: apps/svc_hmrc/Dockerfile
image: ai-tax-agent/svc-hmrc:local
pull_policy: never
apa-svc-ocr:
build:
context: ../../
dockerfile: apps/svc_ocr/Dockerfile
image: ai-tax-agent/svc-ocr:local
pull_policy: never
restart: on-failure
apa-svc-rag-indexer:
build:
context: ../../
dockerfile: apps/svc_rag_indexer/Dockerfile
image: ai-tax-agent/svc-rag-indexer:local
pull_policy: never
apa-svc-reason:
build:
context: ../../
dockerfile: apps/svc_reason/Dockerfile
image: ai-tax-agent/svc-reason:local
pull_policy: never
apa-svc-rpa:
build:
context: ../../
dockerfile: apps/svc_rpa/Dockerfile
image: ai-tax-agent/svc-rpa:local
pull_policy: never
apa-svc-normalize-map:
build:
context: ../../
dockerfile: apps/svc_normalize_map/Dockerfile
image: ai-tax-agent/svc-normalize-map:local
pull_policy: never
apa-svc-coverage:
build:
context: ../../
dockerfile: apps/svc_coverage/Dockerfile
image: ai-tax-agent/svc-coverage:local
pull_policy: never
apa-svc-firm-connectors:
build:
context: ../../
dockerfile: apps/svc_firm_connectors/Dockerfile
image: ai-tax-agent/svc-firm-connectors:local
pull_policy: never
apa-ui-review:
# UI might not have a Dockerfile in root/ui-review/Dockerfile based on previous file view
# Assuming standard build context if it exists, otherwise comment out build
# build:
# context: ../../ui-review
# dockerfile: Dockerfile
image: alpine:latest
profiles: ["disabled"]
environment:
- NEXTAUTH_URL=https://app.local.lan
- API_BASE_URL=https://api.local.lan
apa-minio:
volumes:
- ./traefik/certs/local.crt:/root/.minio/certs/CAs/local.crt:ro
# --- Local Development Specific Services ---
# Services that only exist in local dev (e.g. mailhog if used, or specific tools)
# None identified from docker-compose.local.yml that aren't in base

View File

@@ -0,0 +1,14 @@
# FILE: infra/compose/compose.yaml
# Main entry point for Docker Compose
# Includes base configurations from infra/base/
include:
- ../base/infrastructure.yaml
- ../base/services.yaml
# Monitoring stack is optional for local dev but included for completeness
# Can be disabled via profiles if needed, but keeping simple for now
- ../base/monitoring.yaml
# Define the project name to match the existing convention;
# without it, compose would default to the directory name ('compose').
name: ai-tax-agent

File diff suppressed because it is too large

View File

@@ -1,7 +1,7 @@
# FILE: infra/compose/env.example
# Domain Configuration
DOMAIN=local
DOMAIN=local.lan
EMAIL=admin@local.lan
# Database Passwords
@@ -26,6 +26,7 @@ AUTHENTIK_SECRET_KEY=changeme
AUTHENTIK_OUTPOST_TOKEN=changeme
AUTHENTIK_BOOTSTRAP_EMAIL=admin@local.lan
AUTHENTIK_BOOTSTRAP_PASSWORD=admin123
# AUTHENTIK_BOOTSTRAP_TOKEN: This value will be automatically updated after the initial setup.
AUTHENTIK_BOOTSTRAP_TOKEN=
# Monitoring
@@ -80,7 +81,7 @@ PII_LOG_RETENTION_DAYS=30
# Backup & DR
BACKUP_ENABLED=true
BACKUP_SCHEDULE=0 2 * * *
BACKUP_SCHEDULE="0 2 * * *"
BACKUP_RETENTION_DAYS=30
# Performance Tuning

View File

@@ -0,0 +1,89 @@
http:
middlewares:
authentik-forwardauth:
forwardAuth:
address: "http://apa-authentik-outpost:9000/outpost.goauthentik.io/auth/traefik"
trustForwardHeader: true
authResponseHeaders:
- X-authentik-username
- X-authentik-groups
- X-authentik-email
- X-authentik-name
- X-authentik-uid
- X-authentik-jwt
- X-authentik-meta-jwks
- X-authentik-meta-outpost
- X-authentik-meta-provider
- X-authentik-meta-app
- X-authentik-meta-version
# Large upload middleware for Gitea registry
gitea-large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
retryExpression: "IsNetworkError() && Attempts() < 3"
# Rate limiting for public APIs
rate-limit:
rateLimit:
average: 100
burst: 50
period: 1s
# Security headers
security-headers:
headers:
frameDeny: true
sslRedirect: true
browserXssFilter: true
contentTypeNosniff: true
stsIncludeSubdomains: true
stsPreload: true
stsSeconds: 31536000
# CORS headers
api-cors:
headers:
accessControlAllowMethods:
- GET
- POST
- PUT
- DELETE
- OPTIONS
accessControlAllowOriginList:
- "https://app.harkon.co.uk"
accessControlAllowHeaders:
- "Content-Type"
- "Authorization"
accessControlMaxAge: 100
addVaryHeader: true
# Strip API prefixes
strip-api-prefixes:
stripPrefix:
prefixes:
- "/rag-indexer"
- "/firm-connectors"
- "/normalize-map"
- "/ingestion"
- "/extract"
- "/forms"
- "/hmrc"
- "/ocr"
- "/reason"
- "/rpa"
- "/coverage"
- "/kg"
- "/rag"
tls:
certificates:
- certFile: /var/traefik/certs/local.crt
keyFile: /var/traefik/certs/local.key
options:
default:
minVersion: VersionTLS12
sniStrict: false

View File

@@ -0,0 +1,35 @@
# Traefik static configuration for local development (self-signed TLS)
entryPoints:
web:
address: ":80"
http:
redirections:
entryPoint:
to: websecure
scheme: https
websecure:
address: ":443"
http:
tls:
options: default
providers:
docker:
endpoint: "unix:///var/run/docker.sock"
exposedByDefault: false
network: "apa-frontend"
file:
filename: "/etc/traefik/traefik-dynamic.yml"
watch: true
api:
dashboard: true
insecure: true
serversTransport:
insecureSkipVerify: true
log:
level: INFO
accessLog: {}

infra/postgres/init/unleash.sh (new executable file, 8 lines)

@@ -0,0 +1,8 @@
#!/bin/bash
set -e
psql -v ON_ERROR_STOP=1 --username "$POSTGRES_USER" --dbname "$POSTGRES_DB" <<-EOSQL
CREATE USER unleash WITH PASSWORD '${UNLEASH_DB_PASSWORD:-unleash}';
CREATE DATABASE unleash;
GRANT ALL PRIVILEGES ON DATABASE unleash TO unleash;
EOSQL

View File

@@ -112,6 +112,18 @@ echo ""
compose_cmd() {
local file=$1
shift
# For local environment, use the new unified compose.yaml
if [ "$ENVIRONMENT" = "local" ] && [ "$file" = "all" ]; then
docker compose -f "$INFRA_DIR/compose/compose.yaml" -f "$INFRA_DIR/compose/compose.override.yaml" --env-file "$ENV_FILE" --project-name "ai-tax-agent" "$@"
return
fi
# For other environments or specific stacks, keep existing behavior for now
# or adapt as needed. The goal is to eventually unify everything.
# If file is 'infrastructure.yaml', etc., we might still want to use base/
# directly for production to avoid local overrides.
docker compose -f "$BASE_DIR/$file" --env-file "$ENV_FILE" --project-name "ai-tax-agent-$ENVIRONMENT" "$@"
}
@@ -193,13 +205,18 @@ deploy_all() {
fi
# Deploy in order
deploy_infrastructure "$@"
sleep 5
if [ "$ENVIRONMENT" = "local" ]; then
log_info "Deploying unified stack for local environment..."
compose_cmd "all" up -d "$@"
else
deploy_infrastructure "$@"
sleep 5
deploy_monitoring "$@"
sleep 5
deploy_monitoring "$@"
sleep 5
deploy_services "$@"
deploy_services "$@"
fi
log_success "All stacks deployed successfully!"
echo ""

View File

@@ -0,0 +1,16 @@
{
"godaddy": {
"Account": {
"Email": "info@harkon.co.uk",
"Registration": {
"body": {
"status": "valid"
},
"uri": "https://acme-v02.api.letsencrypt.org/acme/acct/2826907666"
},
"PrivateKey": "MIIJKgIBAAKCAgEA3QhLjGI4WLdnFp7nJe0kaBZ1DCY7zr7aedlwnhCR5lBI+XINnDQCmc+rPM+Z2Ct55ru6LsmmPos80H9bmz858JhTnisJbmlxzXXFJNCqitohhSt5WhYas0fFJo5QIkt+GEnDKLB+Q4j6JETqEivuAE344NcahciESWW+aBRxFmaccjcLFCwU0xBr/5zkk1QyP8/e6s9YrmxskN1JFimJ/qdyb6jNgXkQ7Nx7QRtlcTFO4JkI16U+lba1TAMeUhBbJTH952Rjcc9zFkjDbfQZ0xydJgyhgqeBOVQSLKkdwA0LzjB8MZXprLUwqhMyhgv5Qo9HF+wuexyqwKFuO4KDRteFz0nla5g8dtb+xBUTgLjn3NapZZDtYhKCuPlMApJR8L/pIoEen26P0qdO8HwuykU8Mif9d4zwNfZFa/NuJ+veDppDBYv/BOe5Z6qA0UFchi4Cuh93K5iT/0S0hXI1mmHB1AN8lB5MBbz44iCnPwin2qR7lfIYGXOCX408TCU36sZtMsxf32dcgEq2klXeuY+C55kKI4OdRJsj+SejOla7uy3oqPGpY9sdWwqmWTXQtF+0hSm73e6iqv0RfqTdXuTkOXQDLlPxDG6b9cZJ0yeQoGlu23hYcSElmgCwCz2JjN6WYpXxCG3esFtaG2nVbJ+Jf1CxrsgyIhPmHr3Q3S8CAwEAAQKCAgA0GpV8lVbFCw7hFTpWBW30n36eC5FDrlfgK3LRwAQ0r65UJx+wN855JawvHJ0eiTkmPBCqoNxwl/AREkSs9x2YasAjY+/IOFEcZuu/PvVE4CDQvKvRoa5PntaJvTiErRkfbpvzxo8tKmgVDq3C9NoY9kh58BsPeHI+vx5AeLkj17J/dhxFeBK8on1i90Amvs1Nn5nj7lbwXxzElXV6JPajsiNW0QsIv1pPC7Z+ZY/nPAFlDo44D3sOXdClB4MpQzPJM9yvpEmQ9Z8inKp9C/LegjtFUers2sGqmvfh0UfzEuA6jdFo+vbnwJqlLPtXABGVMCNJL2LRoLNbz3Il0yFQrKoEkK2515QKq3hRo4oK1I9K0Ij1bIod0muC4TRQbpOp90nefcGv/Tquzb66guMDH8blYoVQ+zPtZaC0qFCLUsjh8OMRZv+f741OMICXcSMWSWMvMoRn4pntmmJrR1F3pDUgB5/25c26qFSKTnK9/lNtd90KrF6s2oRW5RDIy5lYXpn7p6tJ4HolMomJ2pRflmMDD8uGXZm9LP3CqfqLjSqmAlDtFCnT7EOkkKG84eyqhReaOTOf9XVGOl8ErxgZrt4UOF+3yorIQJ883V8BLn25rdDbM+cVWQIhh9SNzNP/QMDIYjQxvLnyx3WAtL+xQRCpHmp7/vrG8RxEHaB9cQKCAQEA6lGw699QY1S0hUWI/4fKzIaUkx6a+5NfL1FVsnsmTirdYpI3jue4ZMVguFXF8Loab3omWoVv0jPNIUtdciaIxFGWPbguF8vdMHdWM8mtUj2KgTz67Z3yDUX4dMQ9/FBPq2kJKna/Btp96k+0M8LN0OUE8rNC0jBrOG81wyIUv+02ah+HnzVoR9YciSlZ4ZfWSoigo+UJ4vPeB++1JoMsXfz4lUrLeQlSCY9yLx0Q652Hnd5/YKTjUnrLevopXg+VsWtfP0Q3uljWVLVO/EBkQ2StzNt/VmxtNwPVFXRL9YYkagBt7nI5QMu+XmQXukUnYop2o0u2wgpEeyC5aAVSaQKCAQEA8Xvh33PP2tiCjACyvkG/7Avrr7xWmN9IdXCiDQwfgwDniTip1GahU69NQWuIV0yebDgb/Dg5kLsbZ5ebDpMKbWx6DjZ1hS8t5M6Kux9nYZDVQZosRIe9fwMwrl23obI0h5JfF8rhxZ+wUhG/COVc5qyEehSB9on0CivyNGzOi/thn8oxXw+g3lXtCFiJM3cfRpd1fb5gP+dpab7VzBy7TjJapifs3ST2/TmmkgYZv5xGbdqbgSz3LbEiC5LiCtrUqyH4kpHr6Fhq8DN7R/nY/CakbB06N2SLytrrth+AF1DGakc563mj5RRpY7X/zdkdcIhJGk6lqQQOx8MSe9CP1wKCAQEAvUXjjYRDYRkpAIYclZxQukjzdqtAMXrnZkdi29sSJA4H6fmGG08d6XhuGjhevYb2l5mppXEn1Dm3tu8zumNaEop8u7ossVghgWbEIO0Freq8GIzzfEEbJpGgkmF6WHdfA2zC1KQ6xgRztXNQcocmzVhRWOJoVXR7B4j9enPrIuUwESUK3hW7+FsBjeHzEoEdvfMDH6CBDexDK1H7l/JZQkp3WdCi71ASDlrqtxfZdRk4VNNHPP+0CAncl6e/BpW8KyY6N9aY1VOxPZd/B8/TrYSDx3h+MYc/6TKVStE4Ekma3G0gX32wtaBeU8yyRepaWATUtC8Sn0a/7l2OpnG2EQKCAQEAtEnaM/sCBxC4PpBS4qqyAChSOSzytkWVkmCaDAWuDR+Cvbc5TCOndJQfqKUA8LR6Xq9xbVgI2l5nMmtEz5fGJDXl1nCgQuQbboUpnFTw2S3JmaXiQPPa7VXTZYsAi09B2qnUJy5Ia0Qy3sLzDlA3kNziN0bSVN9f/Kwcszk859OxahwJykAfyX77bcyz+mGITyrLBCs7Ltq1n8ZjVnVo/hOoC/8o3142rI37J3A4jw68ok2g5ctNa6aglWV/L717I51EOSGKsDg69sRo2S7W6kJrZXBYw3xkxfm2G43fEwkyaaxtuLljPKeFm3UI24WqbhbCBUsMcWhfJJMmXJw0lwKCAQEArJ09I6B7g/5G8Ce5G1FTgakrxpbOerAVjFS529CpV/56B9Ml0Gw2/0M6ed+xYQovEHe+r3nCy4LfH2+6YDHgOzo5ZqM4W3MLDCzTYbnQaS8FlDtuOdX9wXsCacpOk/Av9X9YS7mROYMW8F38jU0A4ZR2/gO3paOchXAMvx8ZwrH9Dk7pwAFYkIDdFhWadHo7q4w7raCkcaa4C0IkjFogW/GPfKuMUduNrZ011xJCSyeqZFJdo8YQnVfLAuBQYQO7UMwLgKUaSJp/L9jttYN1NibqGrHIVYaggDaVOmNcfXdOe8uTxsaqaNe0v0WVHVfOkKokHt+thA6+BSHyIzy76w==",
"KeyType": "4096"
},
"Certificates": null
}
}

View File

@@ -0,0 +1,64 @@
http:
middlewares:
authentik-forwardauth:
forwardAuth:
address: "http://apa-authentik-outpost:9000/outpost.goauthentik.io/auth/traefik"
trustForwardHeader: true
authResponseHeaders:
- X-authentik-username
- X-authentik-groups
- X-authentik-email
- X-authentik-name
- X-authentik-uid
- X-authentik-jwt
- X-authentik-meta-jwks
- X-authentik-meta-outpost
- X-authentik-meta-provider
- X-authentik-meta-app
- X-authentik-meta-version
# Large upload middleware for Gitea registry
gitea-large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
retryExpression: "IsNetworkError() && Attempts() < 3"
# Rate limiting for public APIs
api-ratelimit:
rateLimit:
average: 100
burst: 50
period: 1s
# Security headers
security-headers:
headers:
frameDeny: true
sslRedirect: true
browserXssFilter: true
contentTypeNosniff: true
stsIncludeSubdomains: true
stsPreload: true
stsSeconds: 31536000
# CORS headers
api-cors:
headers:
accessControlAllowMethods:
- GET
- POST
- PUT
- DELETE
- OPTIONS
accessControlAllowOriginList:
- "https://app.harkon.co.uk"
accessControlAllowHeaders:
- "Content-Type"
- "Authorization"
accessControlMaxAge: 100
addVaryHeader: true
# Security headers

View File

@@ -0,0 +1,35 @@
# Static Traefik configuration (production)
entryPoints:
web:
address: ":80"
websecure:
address: ":443"
transport:
respondingTimeouts:
readTimeout: 30m
api:
dashboard: true
providers:
docker:
endpoint: "unix:///var/run/docker.sock"
exposedByDefault: false
network: "apa-frontend"
file:
filename: "/etc/traefik/traefik-dynamic.yml"
watch: true
# -- Configure your CertificateResolver here...
certificatesResolvers:
godaddy:
acme:
email: info@harkon.co.uk
storage: /var/traefik/certs/godaddy-acme.json
caServer: "https://acme-v02.api.letsencrypt.org/directory"
dnsChallenge:
provider: godaddy
resolvers:
- 1.1.1.1:53
- 8.8.8.8:53
- 97.74.103.44:53
- 173.201.71.44:53

View File

@@ -1,7 +1,6 @@
"""Configuration management and client factories."""
from .factories import (
EventBusFactory,
MinIOClientFactory,
Neo4jDriverFactory,
QdrantClientFactory,
@@ -28,7 +27,6 @@ __all__ = [
"QdrantClientFactory",
"Neo4jDriverFactory",
"RedisClientFactory",
"EventBusFactory",
"get_settings",
"init_settings",
"create_vault_client",

View File

@@ -2,10 +2,8 @@
from typing import Any
import boto3 # type: ignore
import hvac
import redis.asyncio as redis
from aiokafka import AIOKafkaConsumer, AIOKafkaProducer # type: ignore
from minio import Minio
from neo4j import GraphDatabase
from qdrant_client import QdrantClient
@@ -87,36 +85,3 @@ class RedisClientFactory: # pylint: disable=too-few-public-methods
return redis.from_url(
settings.redis_url, encoding="utf-8", decode_responses=True
)
class EventBusFactory:
"""Factory for creating event bus clients"""
@staticmethod
def create_kafka_producer(settings: BaseAppSettings) -> AIOKafkaProducer:
"""Create Kafka producer"""
return AIOKafkaProducer(
bootstrap_servers=settings.kafka_bootstrap_servers,
value_serializer=lambda v: v.encode("utf-8") if isinstance(v, str) else v,
)
@staticmethod
def create_kafka_consumer(
settings: BaseAppSettings, topics: list[str]
) -> AIOKafkaConsumer:
"""Create Kafka consumer"""
return AIOKafkaConsumer(
*topics,
bootstrap_servers=settings.kafka_bootstrap_servers,
value_deserializer=lambda m: m.decode("utf-8") if m else None,
)
@staticmethod
def create_sqs_client(settings: BaseAppSettings) -> Any:
"""Create SQS client"""
return boto3.client("sqs", region_name=settings.aws_region)
@staticmethod
def create_sns_client(settings: BaseAppSettings) -> Any:
"""Create SNS client"""
return boto3.client("sns", region_name=settings.aws_region)

View File

@@ -8,7 +8,7 @@ class BaseAppSettings(BaseSettings):
"""Base settings class for all services"""
model_config = SettingsConfigDict(
env_file=".env", env_file_encoding="utf-8", case_sensitive=True, extra="ignore"
env_file=".env", env_file_encoding="utf-8", case_sensitive=False, extra="ignore"
)
# Service identification
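
The flip to `case_sensitive=False` lets the upper-case variables injected by docker compose reach the lower-case pydantic fields. A minimal sketch of the behaviour, using a hypothetical `DemoSettings` class and an illustrative variable name rather than the real `BaseAppSettings`:

```python
import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class DemoSettings(BaseSettings):
    """Illustrative stand-in for BaseAppSettings."""

    model_config = SettingsConfigDict(case_sensitive=False, extra="ignore")

    service_name: str = "unset"


# With case_sensitive=False the env var name is matched case-insensitively,
# so a compose-style SERVICE_NAME populates service_name; with
# case_sensitive=True only an exactly matching "service_name" would.
os.environ["SERVICE_NAME"] = "svc-ingestion"
print(DemoSettings().service_name)  # -> svc-ingestion
```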

View File

@@ -67,27 +67,20 @@ async def create_redis_client(settings: BaseAppSettings) -> "redis.Redis[str]":
def create_event_bus(settings: BaseAppSettings) -> EventBus:
"""Create event bus"""
if settings.event_bus_type.lower() == "kafka":
# pylint: disable=import-outside-toplevel
from ..events import KafkaEventBus
return KafkaEventBus(settings.kafka_bootstrap_servers)
if settings.event_bus_type.lower() == "sqs":
# pylint: disable=import-outside-toplevel
from ..events import SQSEventBus
return SQSEventBus(settings.aws_region)
if settings.event_bus_type.lower() == "memory":
# pylint: disable=import-outside-toplevel
from ..events import MemoryEventBus
return MemoryEventBus()
# Default to memory bus for unknown types
# pylint: disable=import-outside-toplevel
from ..events import MemoryEventBus
from libs.events import create_event_bus as _create_event_bus
return MemoryEventBus()
# Extract NATS servers as a list
nats_servers = [s.strip() for s in settings.nats_servers.split(",")]
return _create_event_bus(
settings.event_bus_type,
servers=nats_servers,
stream_name=settings.nats_stream_name,
consumer_group=settings.nats_consumer_group,
bootstrap_servers=settings.kafka_bootstrap_servers,
region_name=settings.aws_region,
)
def get_default_settings(**overrides: Any) -> BaseAppSettings:
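
With the backend-specific imports now lazy, the shared factory can be called directly with the same keyword arguments `runtime.create_event_bus` forwards. A rough sketch; the values shown are the defaults visible in the `NATSEventBus` signature, not deployment settings:

```python
from libs.events import create_event_bus

# The bus type string selects the backend; "kafka" and "sqs" are imported
# lazily inside the factory, so aiokafka/boto3 remain optional dependencies.
bus = create_event_bus(
    "nats",
    servers=["nats://localhost:4222"],  # illustrative local default
    stream_name="TAX_AGENT_EVENTS",
    consumer_group="tax-agent",
)
```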

View File

@@ -1,20 +1,52 @@
"""Event-driven architecture with Kafka, SQS, NATS, and Memory support."""
from libs.schemas.events import (
EVENT_SCHEMA_MAP,
BaseEventData,
CalculationReadyEventData,
DocumentExtractedEventData,
DocumentIngestedEventData,
DocumentOCRReadyEventData,
FirmSyncCompletedEventData,
FormFilledEventData,
HMRCSubmittedEventData,
KGUpsertedEventData,
KGUpsertReadyEventData,
RAGIndexedEventData,
ReviewCompletedEventData,
ReviewRequestedEventData,
get_schema_for_topic,
validate_event_data,
)
from .base import EventBus, EventPayload
from .factory import create_event_bus
from .kafka_bus import KafkaEventBus
from .memory_bus import MemoryEventBus
from .nats_bus import NATSEventBus
from .sqs_bus import SQSEventBus
from .topics import EventTopics
__all__ = [
"EventPayload",
"EventBus",
"KafkaEventBus",
"MemoryEventBus",
"NATSEventBus",
"SQSEventBus",
"create_event_bus",
"EventTopics",
# Event schemas
"BaseEventData",
"DocumentIngestedEventData",
"DocumentOCRReadyEventData",
"DocumentExtractedEventData",
"KGUpsertReadyEventData",
"KGUpsertedEventData",
"RAGIndexedEventData",
"CalculationReadyEventData",
"FormFilledEventData",
"HMRCSubmittedEventData",
"ReviewRequestedEventData",
"ReviewCompletedEventData",
"FirmSyncCompletedEventData",
"EVENT_SCHEMA_MAP",
"validate_event_data",
"get_schema_for_topic",
]

View File

@@ -3,7 +3,7 @@
import json
from abc import ABC, abstractmethod
from collections.abc import Awaitable, Callable
from datetime import datetime
from datetime import UTC, datetime
from typing import Any
import ulid
@@ -22,7 +22,7 @@ class EventPayload:
schema_version: str = "1.0",
):
self.event_id = str(ulid.new())
self.occurred_at = datetime.utcnow().isoformat() + "Z"
self.occurred_at = datetime.now(UTC).isoformat()
self.actor = actor
self.tenant_id = tenant_id
self.trace_id = trace_id
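
The timestamp change swaps a naive `utcnow()` string with a hand-appended `Z` for a timezone-aware ISO 8601 value. Roughly, the difference in output:

```python
from datetime import UTC, datetime

old_style = datetime.utcnow().isoformat() + "Z"  # naive, e.g. "2025-11-26T13:17:17.123456Z"
new_style = datetime.now(UTC).isoformat()        # aware, e.g. "2025-11-26T13:17:17.123456+00:00"

# Both denote UTC, but only the aware form carries tzinfo when parsed back.
parsed = datetime.fromisoformat(new_style)
assert parsed.tzinfo is not None
```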

View File

@@ -7,7 +7,7 @@ from collections.abc import Awaitable, Callable
import structlog
from aiokafka import AIOKafkaConsumer, AIOKafkaProducer # type: ignore
from .base import EventBus, EventPayload
from ..base import EventBus, EventPayload
logger = structlog.get_logger()

View File

@@ -9,7 +9,7 @@ import boto3 # type: ignore
import structlog
from botocore.exceptions import ClientError # type: ignore
from .base import EventBus, EventPayload
from ..base import EventBus, EventPayload
logger = structlog.get_logger()

libs/events/dlq.py (new file, 271 lines)

@@ -0,0 +1,271 @@
"""Dead Letter Queue (DLQ) handler for failed event processing."""
import asyncio
import json
from datetime import UTC, datetime
from typing import Any
import structlog
from nats.js import JetStreamContext
from .base import EventPayload
logger = structlog.get_logger()
class DLQHandler:
"""
Dead Letter Queue handler for processing failed events.
Captures events that fail processing after max retries and stores them
in a separate NATS stream for manual review and retry.
"""
def __init__(
self,
js: JetStreamContext,
dlq_stream_name: str = "TAX_AGENT_DLQ",
max_retries: int = 3,
backoff_base_ms: int = 1000,
backoff_multiplier: float = 2.0,
backoff_max_ms: int = 30000,
):
"""
Initialize DLQ handler.
Args:
js: NATS JetStream context
dlq_stream_name: Name of the DLQ stream
max_retries: Maximum number of retry attempts
backoff_base_ms: Base backoff time in milliseconds
backoff_multiplier: Exponential backoff multiplier
backoff_max_ms: Maximum backoff time in milliseconds
"""
self.js = js
self.dlq_stream_name = dlq_stream_name
self.max_retries = max_retries
self.backoff_base_ms = backoff_base_ms
self.backoff_multiplier = backoff_multiplier
self.backoff_max_ms = backoff_max_ms
async def ensure_dlq_stream_exists(self) -> None:
"""Ensure DLQ stream exists in JetStream."""
try:
# Try to get stream info
await self.js.stream_info(self.dlq_stream_name)
logger.debug("DLQ stream already exists", stream=self.dlq_stream_name)
except Exception:
# Stream doesn't exist, create it
try:
await self.js.add_stream(
name=self.dlq_stream_name,
subjects=[f"{self.dlq_stream_name}.>"],
# Keep DLQ messages for 30 days
max_age=30 * 24 * 60 * 60, # 30 days in seconds
)
logger.info("Created DLQ stream", stream=self.dlq_stream_name)
except Exception as e:
logger.error(
"Failed to create DLQ stream",
stream=self.dlq_stream_name,
error=str(e),
)
raise
async def send_to_dlq(
self,
topic: str,
payload: EventPayload,
error: Exception,
retry_count: int,
original_message_data: bytes | None = None,
) -> None:
"""
Send failed event to DLQ.
Args:
topic: Original topic name
payload: Event payload
error: Exception that caused the failure
retry_count: Number of retry attempts made
original_message_data: Original message data (optional, for debugging)
"""
try:
# Create DLQ subject
dlq_subject = f"{self.dlq_stream_name}.{topic}"
# Create DLQ payload with metadata
dlq_payload = {
"original_topic": topic,
"original_payload": payload.to_dict(),
"error": {
"type": type(error).__name__,
"message": str(error),
},
"retry_count": retry_count,
"failed_at": datetime.now(UTC).isoformat(),
"tenant_id": payload.tenant_id,
"event_id": payload.event_id,
"trace_id": payload.trace_id,
}
# Add original message data if available
if original_message_data:
try:
dlq_payload["original_message_data"] = original_message_data.decode(
"utf-8"
)
except UnicodeDecodeError:
dlq_payload["original_message_data"] = "<binary data>"
# Publish to DLQ
headers = {
"original_topic": topic,
"tenant_id": payload.tenant_id,
"event_id": payload.event_id,
"error_type": type(error).__name__,
"retry_count": str(retry_count),
}
await self.js.publish(
subject=dlq_subject,
payload=json.dumps(dlq_payload).encode(),
headers=headers,
)
logger.error(
"Event sent to DLQ",
topic=topic,
event_id=payload.event_id,
error=str(error),
retry_count=retry_count,
dlq_subject=dlq_subject,
)
except Exception as dlq_error:
logger.critical(
"Failed to send event to DLQ - EVENT LOST",
topic=topic,
event_id=payload.event_id,
original_error=str(error),
dlq_error=str(dlq_error),
)
def calculate_backoff(self, retry_count: int) -> float:
"""
Calculate exponential backoff delay.
Args:
retry_count: Current retry attempt (0-indexed)
Returns:
Backoff delay in seconds
"""
# Calculate exponential backoff: base * (multiplier ^ retry_count)
backoff_ms = self.backoff_base_ms * (self.backoff_multiplier**retry_count)
# Cap at maximum backoff
backoff_ms = min(backoff_ms, self.backoff_max_ms)
# Convert to seconds
return backoff_ms / 1000.0
async def retry_with_backoff(
self,
func: Any,
*args: Any,
**kwargs: Any,
) -> tuple[bool, Exception | None]:
"""
Retry a function with exponential backoff.
Args:
func: Async function to retry
*args: Positional arguments for the function
**kwargs: Keyword arguments for the function
Returns:
Tuple of (success: bool, last_error: Exception | None)
"""
last_error: Exception | None = None
for attempt in range(self.max_retries + 1):
try:
await func(*args, **kwargs)
return (True, None)
except Exception as e: # pylint: disable=broad-exception-caught
last_error = e
if attempt < self.max_retries:
# Calculate and apply backoff
backoff_seconds = self.calculate_backoff(attempt)
logger.warning(
"Retry attempt failed, backing off",
attempt=attempt + 1,
max_retries=self.max_retries,
backoff_seconds=backoff_seconds,
error=str(e),
)
await asyncio.sleep(backoff_seconds)
else:
logger.error(
"All retry attempts exhausted",
attempts=self.max_retries + 1,
error=str(e),
)
return (False, last_error)
class DLQMetrics:
"""Metrics for DLQ operations."""
def __init__(self) -> None:
"""Initialize DLQ metrics."""
self.total_dlq_events = 0
self.dlq_events_by_topic: dict[str, int] = {}
self.dlq_events_by_error_type: dict[str, int] = {}
def record_dlq_event(self, topic: str, error_type: str) -> None:
"""
Record a DLQ event.
Args:
topic: Original topic name
error_type: Type of error that caused DLQ
"""
self.total_dlq_events += 1
# Track by topic
if topic not in self.dlq_events_by_topic:
self.dlq_events_by_topic[topic] = 0
self.dlq_events_by_topic[topic] += 1
# Track by error type
if error_type not in self.dlq_events_by_error_type:
self.dlq_events_by_error_type[error_type] = 0
self.dlq_events_by_error_type[error_type] += 1
def get_metrics(self) -> dict[str, Any]:
"""
Get DLQ metrics.
Returns:
Dictionary of metrics
"""
return {
"total_dlq_events": self.total_dlq_events,
"by_topic": self.dlq_events_by_topic.copy(),
"by_error_type": self.dlq_events_by_error_type.copy(),
}
def reset(self) -> None:
"""Reset all metrics to zero."""
self.total_dlq_events = 0
self.dlq_events_by_topic.clear()
self.dlq_events_by_error_type.clear()
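
Taken together, `retry_with_backoff` and `send_to_dlq` give subscribers a uniform failure path: retry with exponential backoff (1s, 2s, 4s with the defaults above, capped at 30s), then park the event. A rough consumer-side sketch; `handler` and `payload` are placeholders, and the JetStream context comes from an already-established NATS connection:

```python
import nats

from libs.events import EventPayload
from libs.events.dlq import DLQHandler


async def process_with_dlq(topic: str, payload: EventPayload, handler) -> None:
    """Hypothetical wiring of DLQHandler around a business handler."""
    nc = await nats.connect("nats://localhost:4222")  # illustrative server
    js = nc.jetstream()

    dlq = DLQHandler(js, dlq_stream_name="TAX_AGENT_DLQ", max_retries=3)
    await dlq.ensure_dlq_stream_exists()

    # Retries the handler up to max_retries times with exponential backoff;
    # on exhaustion the event (plus error metadata) is published to the DLQ stream.
    ok, last_error = await dlq.retry_with_backoff(handler, payload)
    if not ok and last_error is not None:
        await dlq.send_to_dlq(topic, payload, last_error, retry_count=dlq.max_retries)

    await nc.close()
```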

View File

@@ -3,16 +3,20 @@
from typing import Any
from .base import EventBus
from .kafka_bus import KafkaEventBus
from .nats_bus import NATSEventBus
from .sqs_bus import SQSEventBus
def create_event_bus(bus_type: str, **kwargs: Any) -> EventBus:
"""Factory function to create event bus"""
if bus_type.lower() == "kafka":
# Lazy import to avoid ModuleNotFoundError when aiokafka is not installed
from .contrib.kafka_bus import KafkaEventBus
return KafkaEventBus(kwargs.get("bootstrap_servers", "localhost:9092"))
if bus_type.lower() == "sqs":
# Lazy import to avoid ModuleNotFoundError when boto3 is not installed
from .contrib.sqs_bus import SQSEventBus
return SQSEventBus(kwargs.get("region_name", "us-east-1"))
if bus_type.lower() == "nats":
return NATSEventBus(

libs/events/metrics.py (new file, 225 lines)

@@ -0,0 +1,225 @@
"""Prometheus metrics for event bus monitoring."""
from prometheus_client import Counter, Histogram
from prometheus_client.registry import CollectorRegistry
# Global registry for event metrics
_event_registry = CollectorRegistry()
# Event publishing metrics
event_published_total = Counter(
"event_published_total",
"Total number of events published",
["topic"],
registry=_event_registry,
)
event_publish_errors_total = Counter(
"event_publish_errors_total",
"Total number of event publishing errors",
["topic", "error_type"],
registry=_event_registry,
)
event_publishing_duration_seconds = Histogram(
"event_publishing_duration_seconds",
"Time spent publishing events in seconds",
["topic"],
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
registry=_event_registry,
)
# Event consumption metrics
event_consumed_total = Counter(
"event_consumed_total",
"Total number of events consumed",
["topic", "consumer_group"],
registry=_event_registry,
)
event_processing_duration_seconds = Histogram(
"event_processing_duration_seconds",
"Time spent processing events in seconds",
["topic", "consumer_group"],
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
registry=_event_registry,
)
event_processing_errors_total = Counter(
"event_processing_errors_total",
"Total number of event processing errors",
["topic", "consumer_group", "error_type"],
registry=_event_registry,
)
# DLQ metrics
event_dlq_total = Counter(
"event_dlq_total",
"Total number of events sent to dead letter queue",
["topic", "error_type"],
registry=_event_registry,
)
event_retry_total = Counter(
"event_retry_total",
"Total number of event retry attempts",
["topic", "retry_attempt"],
registry=_event_registry,
)
# Schema validation metrics
event_schema_validation_errors_total = Counter(
"event_schema_validation_errors_total",
"Total number of event schema validation errors",
["topic", "validation_error"],
registry=_event_registry,
)
# NATS JetStream specific metrics
nats_stream_messages_total = Counter(
"nats_stream_messages_total",
"Total messages in NATS stream",
["stream_name"],
registry=_event_registry,
)
nats_consumer_lag_messages = Histogram(
"nats_consumer_lag_messages",
"Number of messages consumer is lagging behind",
["stream_name", "consumer_group"],
buckets=(0, 1, 5, 10, 25, 50, 100, 250, 500, 1000, 5000, 10000),
registry=_event_registry,
)
def get_event_metrics_registry() -> CollectorRegistry:
"""
Get the Prometheus registry for event metrics.
Returns:
CollectorRegistry for event metrics
"""
return _event_registry
class EventMetricsCollector:
"""Helper class for collecting event metrics."""
@staticmethod
def record_publish(
topic: str,
duration_seconds: float,
success: bool = True,
error_type: str | None = None,
) -> None:
"""
Record event publishing metrics.
Args:
topic: Event topic name
duration_seconds: Time taken to publish
success: Whether publishing succeeded
error_type: Type of error if failed
"""
if success:
event_published_total.labels(topic=topic).inc()
else:
event_publish_errors_total.labels(
topic=topic, error_type=error_type or "unknown"
).inc()
event_publishing_duration_seconds.labels(topic=topic).observe(duration_seconds)
@staticmethod
def record_consume(
topic: str,
consumer_group: str,
duration_seconds: float,
success: bool = True,
error_type: str | None = None,
) -> None:
"""
Record event consumption metrics.
Args:
topic: Event topic name
consumer_group: Consumer group name
duration_seconds: Time taken to process event
success: Whether processing succeeded
error_type: Type of error if failed
"""
if success:
event_consumed_total.labels(
topic=topic, consumer_group=consumer_group
).inc()
else:
event_processing_errors_total.labels(
topic=topic,
consumer_group=consumer_group,
error_type=error_type or "unknown",
).inc()
event_processing_duration_seconds.labels(
topic=topic, consumer_group=consumer_group
).observe(duration_seconds)
@staticmethod
def record_dlq(topic: str, error_type: str) -> None:
"""
Record event sent to DLQ.
Args:
topic: Event topic name
error_type: Type of error that caused DLQ
"""
event_dlq_total.labels(topic=topic, error_type=error_type).inc()
@staticmethod
def record_retry(topic: str, retry_attempt: int) -> None:
"""
Record event retry attempt.
Args:
topic: Event topic name
retry_attempt: Retry attempt number (1-indexed)
"""
event_retry_total.labels(topic=topic, retry_attempt=str(retry_attempt)).inc()
@staticmethod
def record_schema_validation_error(topic: str, validation_error: str) -> None:
"""
Record schema validation error.
Args:
topic: Event topic name
validation_error: Type of validation error
"""
event_schema_validation_errors_total.labels(
topic=topic, validation_error=validation_error
).inc()
@staticmethod
def record_nats_stream_message(stream_name: str) -> None:
"""
Record message added to NATS stream.
Args:
stream_name: NATS stream name
"""
nats_stream_messages_total.labels(stream_name=stream_name).inc()
@staticmethod
def record_consumer_lag(
stream_name: str, consumer_group: str, lag_messages: int
) -> None:
"""
Record consumer lag.
Args:
stream_name: NATS stream name
consumer_group: Consumer group name
lag_messages: Number of messages consumer is behind
"""
nats_consumer_lag_messages.labels(
stream_name=stream_name, consumer_group=consumer_group
).observe(lag_messages)

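A short sketch of how a producer might use `EventMetricsCollector` around a send, mirroring the timing pattern the NATS bus uses below; `do_publish` is a hypothetical stand-in for the real transport call, and the registry export at the end shows one way to expose these metrics.

```python
# Sketch: time a publish, record success/failure, and expose the registry.
import time

from prometheus_client import generate_latest

from libs.events.metrics import (  # import path assumed from the repo layout
    EventMetricsCollector,
    get_event_metrics_registry,
)


async def publish_with_metrics(topic: str, payload: bytes) -> None:
    start = time.perf_counter()
    try:
        await do_publish(topic, payload)  # hypothetical transport call
        EventMetricsCollector.record_publish(topic, time.perf_counter() - start)
    except Exception as exc:
        EventMetricsCollector.record_publish(
            topic,
            time.perf_counter() - start,
            success=False,
            error_type=type(exc).__name__,
        )
        raise


def metrics_endpoint_body() -> bytes:
    # Prometheus text exposition for the dedicated event registry.
    return generate_latest(get_event_metrics_registry())
```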

@@ -2,6 +2,7 @@
import asyncio
import json
import time
from collections.abc import Awaitable, Callable
from typing import Any
@@ -12,6 +13,8 @@ from nats.js import JetStreamContext
from nats.js.api import AckPolicy, ConsumerConfig, DeliverPolicy
from .base import EventBus, EventPayload
from .dlq import DLQHandler
from .metrics import EventMetricsCollector
logger = structlog.get_logger()
@@ -24,6 +27,8 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
servers: str | list[str] = "nats://localhost:4222",
stream_name: str = "TAX_AGENT_EVENTS",
consumer_group: str = "tax-agent",
dlq_stream_name: str = "TAX_AGENT_DLQ",
max_retries: int = 3,
):
if isinstance(servers, str):
self.servers = [servers]
@@ -32,8 +37,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
self.stream_name = stream_name
self.consumer_group = consumer_group
self.dlq_stream_name = dlq_stream_name
self.max_retries = max_retries
self.nc: NATS | None = None
self.js: JetStreamContext | None = None
self.dlq: DLQHandler | None = None
self.handlers: dict[
str, list[Callable[[str, EventPayload], Awaitable[None]]]
] = {}
@@ -48,19 +58,32 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
try:
# Connect to NATS
self.nc = await nats.connect(servers=self.servers)
self.nc = await nats.connect(
servers=self.servers,
connect_timeout=10,
reconnect_time_wait=1,
)
# Get JetStream context
self.js = self.nc.jetstream()
self.js = self.nc.jetstream(timeout=10)
# Ensure stream exists
# Initialize DLQ handler
self.dlq = DLQHandler(
js=self.js,
dlq_stream_name=self.dlq_stream_name,
max_retries=self.max_retries,
)
# Ensure streams exist
await self._ensure_stream_exists()
await self.dlq.ensure_dlq_stream_exists()
self.running = True
logger.info(
"NATS event bus started",
servers=self.servers,
stream=self.stream_name,
dlq_stream=self.dlq_stream_name,
)
except Exception as e:
@@ -98,6 +121,7 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
if not self.js:
raise RuntimeError("Event bus not started")
start_time = time.perf_counter()
try:
# Create subject name from topic
subject = f"{self.stream_name}.{topic}"
@@ -117,6 +141,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
headers=headers,
)
duration = time.perf_counter() - start_time
EventMetricsCollector.record_publish(
topic=topic,
duration_seconds=duration,
success=True,
)
logger.info(
"Event published",
topic=topic,
@@ -127,6 +158,14 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
return True
except Exception as e: # pylint: disable=broad-exception-caught
duration = time.perf_counter() - start_time
EventMetricsCollector.record_publish(
topic=topic,
duration_seconds=duration,
success=False,
error_type=type(e).__name__,
)
logger.error(
"Failed to publish event",
topic=topic,
@@ -152,9 +191,13 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
subject = f"{self.stream_name}.{topic}"
# Create durable consumer
consumer_name = f"{self.consumer_group}-{topic}"
# Durable names cannot contain dots, so we replace them
safe_topic = topic.replace(".", "-")
consumer_name = f"{self.consumer_group}-{safe_topic}"
# Subscribe with pull-based consumer
# Set max_deliver to max_retries + 1 (initial + retries)
# We handle DLQ manually before NATS gives up
subscription = await self.js.pull_subscribe(
subject=subject,
durable=consumer_name,
@@ -162,7 +205,7 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
durable_name=consumer_name,
ack_policy=AckPolicy.EXPLICIT,
deliver_policy=DeliverPolicy.NEW,
max_deliver=3,
max_deliver=self.max_retries + 2, # Give us room to handle DLQ
ack_wait=30, # 30 seconds
),
)
@@ -193,13 +236,14 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
# Try to get stream info
await self.js.stream_info(self.stream_name)
logger.debug("Stream already exists", stream=self.stream_name)
EventMetricsCollector.record_nats_stream_message(self.stream_name)
except Exception:
# Stream doesn't exist, create it
try:
await self.js.add_stream(
name=self.stream_name,
subjects=[f"{self.stream_name}.*"],
subjects=[f"{self.stream_name}.>"],
)
logger.info("Created JetStream stream", stream=self.stream_name)
@@ -214,12 +258,17 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
while self.running:
try:
# Fetch messages in batches
messages = await subscription.fetch(batch=10, timeout=20)
messages = await subscription.fetch(batch=10, timeout=5)
for message in messages:
start_time = time.perf_counter()
payload = None
try:
print(f"DEBUG: Received message: {message.data}")
# Parse message payload
payload_dict = json.loads(message.data.decode())
print(f"DEBUG: Parsed payload: {payload_dict}")
payload = EventPayload(
data=payload_dict["data"],
@@ -230,38 +279,87 @@ class NATSEventBus(EventBus): # pylint: disable=too-many-instance-attributes
)
payload.event_id = payload_dict["event_id"]
payload.occurred_at = payload_dict["occurred_at"]
print(f"DEBUG: Reconstructed payload: {payload.event_id}")
# Call all handlers for this topic
for handler in self.handlers.get(topic, []):
try:
await handler(topic, payload)
except (
Exception
) as e: # pylint: disable=broad-exception-caught
logger.error(
"Handler failed",
topic=topic,
event_id=payload.event_id,
error=str(e),
)
print(f"DEBUG: Calling handler for topic {topic}")
await handler(topic, payload)
# Acknowledge message
await message.ack()
print("DEBUG: Message acked")
except json.JSONDecodeError as e:
logger.error(
"Failed to decode message", topic=topic, error=str(e)
# Record metrics
duration = time.perf_counter() - start_time
EventMetricsCollector.record_consume(
topic=topic,
consumer_group=self.consumer_group,
duration_seconds=duration,
success=True,
)
await message.nak()
except Exception as e: # pylint: disable=broad-exception-caught
logger.error(
"Failed to process message", topic=topic, error=str(e)
duration = time.perf_counter() - start_time
error_type = type(e).__name__
# Record failure metric
EventMetricsCollector.record_consume(
topic=topic,
consumer_group=self.consumer_group,
duration_seconds=duration,
success=False,
error_type=error_type,
)
await message.nak()
# Check delivery count for DLQ
try:
# nats-py exposes the delivery attempt count as Msg.Metadata.num_delivered
num_delivered = message.metadata.num_delivered
except Exception:
num_delivered = 1
if num_delivered >= self.max_retries:
logger.error(
"Max retries exceeded, sending to DLQ",
topic=topic,
event_id=payload.event_id if payload else "unknown",
error=str(e),
num_delivered=num_delivered,
)
if self.dlq and payload:
await self.dlq.send_to_dlq(
topic=topic,
payload=payload,
error=e,
retry_count=num_delivered,
original_message_data=message.data,
)
EventMetricsCollector.record_dlq(topic, error_type)
# Ack to remove from main stream
await message.ack()
else:
# Retry (Nak)
logger.warning(
"Processing failed, retrying",
topic=topic,
event_id=payload.event_id if payload else "unknown",
error=str(e),
attempt=num_delivered,
)
EventMetricsCollector.record_retry(topic, num_delivered)
await message.nak()
except TimeoutError:
# No messages available, continue polling
continue
except Exception as e: # pylint: disable=broad-exception-caught
logger.error("Consumer error", topic=topic, error=str(e))
await asyncio.sleep(5) # Wait before retrying
await asyncio.sleep(1) # Wait before retrying

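The retry/DLQ branch above reduces to a single threshold on the delivery count; a tiny pure restatement like the following (an illustration, not part of the module) makes the behaviour easy to unit-test.

```python
# Sketch: the routing rule applied to each failed message. Once a message has
# been delivered max_retries times it is parked in the DLQ and acked;
# otherwise it is nak'd for redelivery.
def should_dead_letter(num_delivered: int, max_retries: int) -> bool:
    return num_delivered >= max_retries


assert should_dead_letter(3, 3) is True    # final attempt goes to the DLQ
assert should_dead_letter(2, 3) is False   # earlier attempts are retried
```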

@@ -7,6 +7,7 @@ class EventTopics: # pylint: disable=too-few-public-methods
DOC_INGESTED = "doc.ingested"
DOC_OCR_READY = "doc.ocr_ready"
DOC_EXTRACTED = "doc.extracted"
KG_UPSERT_READY = "kg.upsert.ready"
KG_UPSERTED = "kg.upserted"
RAG_INDEXED = "rag.indexed"
CALC_SCHEDULE_READY = "calc.schedule_ready"


@@ -11,8 +11,8 @@ psycopg2-binary>=2.9.11
neo4j>=6.0.2
redis[hiredis]>=6.4.0
# Object storage and vector database
minio>=7.2.18
boto3>=1.34.0
qdrant-client>=1.15.1
# Event streaming (NATS only - removed Kafka)
@@ -36,3 +36,13 @@ python-multipart>=0.0.20
python-dateutil>=2.9.0
python-dotenv>=1.1.1
orjson>=3.11.3
jsonschema>=4.20.0
# OpenTelemetry instrumentation (for observability)
opentelemetry-api>=1.21.0
opentelemetry-sdk>=1.21.0
opentelemetry-exporter-otlp-proto-grpc>=1.21.0
opentelemetry-instrumentation-fastapi>=0.42b0
opentelemetry-instrumentation-httpx>=0.42b0
opentelemetry-instrumentation-psycopg2>=0.42b0
opentelemetry-instrumentation-redis>=0.42b0


@@ -65,6 +65,26 @@ from .enums import (
# Import error models
from .errors import ErrorResponse, ValidationError, ValidationErrorResponse
# Import event schemas
from .events import (
EVENT_SCHEMA_MAP,
BaseEventData,
CalculationReadyEventData,
DocumentExtractedEventData,
DocumentIngestedEventData,
DocumentOCRReadyEventData,
FirmSyncCompletedEventData,
FormFilledEventData,
HMRCSubmittedEventData,
KGUpsertedEventData,
KGUpsertReadyEventData,
RAGIndexedEventData,
ReviewCompletedEventData,
ReviewRequestedEventData,
get_schema_for_topic,
validate_event_data,
)
# Import health models
from .health import HealthCheck, ServiceHealth
@@ -135,7 +155,7 @@ __all__ = [
"DocumentUploadResponse",
"ExtractionResponse",
"FirmSyncResponse",
"HMRCSubmissionResponse",
"HMRCSubmittedEventData",
"RAGSearchResponse",
"ScheduleComputeResponse",
# Utils
@@ -172,4 +192,21 @@ __all__ = [
"ValidationResult",
"PolicyVersion",
"CoverageAudit",
# Event schemas
"BaseEventData",
"DocumentIngestedEventData",
"DocumentOCRReadyEventData",
"DocumentExtractedEventData",
"KGUpsertReadyEventData",
"KGUpsertedEventData",
"RAGIndexedEventData",
"CalculationReadyEventData",
"FormFilledEventData",
"HMRCSubmittedEventData",
"ReviewRequestedEventData",
"ReviewCompletedEventData",
"FirmSyncCompletedEventData",
"EVENT_SCHEMA_MAP",
"validate_event_data",
"get_schema_for_topic",
]

libs/schemas/events.py (new file)
@@ -0,0 +1,309 @@
"""Typed event payload schemas for validation and type safety."""
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
# Base schema for all events
class BaseEventData(BaseModel):
"""Base class for all event data payloads."""
model_config = ConfigDict(
extra="forbid", # Prevent unexpected fields
frozen=True, # Make immutable
)
# Document lifecycle events
class DocumentIngestedEventData(BaseEventData):
"""Event emitted when a document is successfully ingested."""
doc_id: str = Field(..., description="Unique document identifier (ULID)")
filename: str = Field(..., description="Original filename")
mime_type: str = Field(..., description="MIME type of the document")
size_bytes: int = Field(..., ge=0, description="File size in bytes")
checksum_sha256: str = Field(..., description="SHA-256 checksum for integrity")
kind: str = Field(
..., description="Document kind (invoice, receipt, bank_statement, etc.)"
)
source: str = Field(
..., description="Ingestion source (manual_upload, rpa, email, api)"
)
storage_path: str = Field(..., description="MinIO object storage path")
metadata: dict[str, Any] = Field(
default_factory=dict, description="Additional metadata"
)
@field_validator("checksum_sha256")
@classmethod
def validate_checksum(cls, v: str) -> str:
"""Validate SHA-256 checksum format."""
if len(v) != 64 or not all(c in "0123456789abcdef" for c in v.lower()):
raise ValueError("Invalid SHA-256 checksum format")
return v.lower()
class DocumentOCRReadyEventData(BaseEventData):
"""Event emitted when OCR processing is complete."""
doc_id: str = Field(..., description="Document identifier")
ocr_engine: Literal["tesseract", "textract", "azure_ocr"] = Field(
..., description="OCR engine used"
)
page_count: int = Field(..., ge=1, description="Number of pages processed")
confidence_avg: float = Field(
..., ge=0.0, le=1.0, description="Average OCR confidence score"
)
text_length: int = Field(..., ge=0, description="Total extracted text length")
layout_detected: bool = Field(
..., description="Whether document layout was successfully detected"
)
languages_detected: list[str] = Field(
default_factory=list, description="Detected languages (ISO 639-1 codes)"
)
processing_time_ms: int = Field(
..., ge=0, description="Processing time in milliseconds"
)
storage_path: str = Field(..., description="Path to OCR results in storage")
class DocumentExtractedEventData(BaseEventData):
"""Event emitted when field extraction is complete."""
doc_id: str = Field(..., description="Document identifier")
extraction_id: str = Field(..., description="Unique extraction run identifier")
strategy: Literal["llm", "rules", "hybrid"] = Field(
..., description="Extraction strategy used"
)
fields_extracted: int = Field(..., ge=0, description="Number of fields extracted")
confidence_avg: float = Field(
..., ge=0.0, le=1.0, description="Average extraction confidence"
)
calibrated_confidence: float = Field(
..., ge=0.0, le=1.0, description="Calibrated confidence score"
)
model_name: str | None = Field(None, description="LLM model used (if applicable)")
processing_time_ms: int = Field(
..., ge=0, description="Processing time in milliseconds"
)
storage_path: str = Field(..., description="Path to extraction results")
# Knowledge Graph events
class KGUpsertReadyEventData(BaseEventData):
"""Event emitted when KG upsert data is ready."""
doc_id: str = Field(..., description="Source document identifier")
entity_count: int = Field(..., ge=0, description="Number of entities to upsert")
relationship_count: int = Field(
..., ge=0, description="Number of relationships to upsert"
)
tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
taxpayer_id: str = Field(..., description="Taxpayer identifier")
normalization_id: str = Field(..., description="Normalization run identifier")
storage_path: str = Field(..., description="Path to normalized data")
class KGUpsertedEventData(BaseEventData):
"""Event emitted when KG upsert is complete."""
doc_id: str = Field(..., description="Source document identifier")
entities_created: int = Field(..., ge=0, description="Entities created")
entities_updated: int = Field(..., ge=0, description="Entities updated")
relationships_created: int = Field(..., ge=0, description="Relationships created")
relationships_updated: int = Field(..., ge=0, description="Relationships updated")
shacl_violations: int = Field(
..., ge=0, description="Number of SHACL validation violations"
)
processing_time_ms: int = Field(
..., ge=0, description="Processing time in milliseconds"
)
success: bool = Field(..., description="Whether upsert was successful")
error_message: str | None = Field(None, description="Error message if failed")
# RAG events
class RAGIndexedEventData(BaseEventData):
"""Event emitted when RAG indexing is complete."""
doc_id: str = Field(..., description="Source document identifier")
collection_name: str = Field(..., description="Qdrant collection name")
chunks_indexed: int = Field(..., ge=0, description="Number of chunks indexed")
embedding_model: str = Field(..., description="Embedding model used")
pii_detected: bool = Field(..., description="Whether PII was detected")
pii_redacted: bool = Field(..., description="Whether PII was redacted")
processing_time_ms: int = Field(
..., ge=0, description="Processing time in milliseconds"
)
storage_path: str = Field(..., description="Path to chunked data")
# Calculation events
class CalculationReadyEventData(BaseEventData):
"""Event emitted when tax calculation is complete."""
taxpayer_id: str = Field(..., description="Taxpayer identifier")
tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
schedule_id: str = Field(..., description="Tax schedule identifier (SA102, SA103)")
calculation_id: str = Field(..., description="Unique calculation run identifier")
boxes_computed: int = Field(..., ge=0, description="Number of form boxes computed")
total_income: float | None = Field(None, description="Total income calculated")
total_tax: float | None = Field(None, description="Total tax calculated")
confidence: float = Field(
..., ge=0.0, le=1.0, description="Calculation confidence score"
)
evidence_count: int = Field(
..., ge=0, description="Number of evidence items supporting calculation"
)
processing_time_ms: int = Field(
..., ge=0, description="Processing time in milliseconds"
)
storage_path: str = Field(..., description="Path to calculation results")
# Form events
class FormFilledEventData(BaseEventData):
"""Event emitted when PDF form filling is complete."""
taxpayer_id: str = Field(..., description="Taxpayer identifier")
tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
form_id: str = Field(..., description="Form identifier (SA100, SA102, etc.)")
fields_filled: int = Field(..., ge=0, description="Number of fields filled")
pdf_size_bytes: int = Field(..., ge=0, description="Generated PDF size in bytes")
storage_path: str = Field(..., description="Path to filled PDF")
evidence_bundle_path: str | None = Field(
None, description="Path to evidence bundle ZIP"
)
checksum_sha256: str = Field(..., description="PDF checksum for integrity")
# HMRC events
class HMRCSubmittedEventData(BaseEventData):
"""Event emitted when HMRC submission is complete."""
taxpayer_id: str = Field(..., description="Taxpayer identifier")
tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
submission_id: str = Field(..., description="Unique submission identifier")
hmrc_reference: str | None = Field(None, description="HMRC submission reference")
submission_type: Literal["dry_run", "sandbox", "live"] = Field(
..., description="Submission environment type"
)
success: bool = Field(..., description="Whether submission was successful")
status_code: int | None = Field(None, description="HTTP status code")
error_message: str | None = Field(None, description="Error message if failed")
processing_time_ms: int = Field(
..., ge=0, description="Processing time in milliseconds"
)
# Review events
class ReviewRequestedEventData(BaseEventData):
"""Event emitted when human review is requested."""
doc_id: str = Field(..., description="Document identifier")
review_type: Literal["extraction", "calculation", "submission"] = Field(
..., description="Type of review needed"
)
priority: Literal["low", "medium", "high", "urgent"] = Field(
..., description="Review priority level"
)
reason: str = Field(..., description="Reason for review request")
assigned_to: str | None = Field(None, description="User assigned to review")
due_date: str | None = Field(None, description="Review due date (ISO 8601)")
metadata: dict[str, Any] = Field(
default_factory=dict, description="Additional review metadata"
)
class ReviewCompletedEventData(BaseEventData):
"""Event emitted when human review is completed."""
doc_id: str = Field(..., description="Document identifier")
review_id: str = Field(..., description="Review session identifier")
reviewer: str = Field(..., description="User who completed review")
decision: Literal["approved", "rejected", "needs_revision"] = Field(
..., description="Review decision"
)
changes_made: int = Field(..., ge=0, description="Number of changes made")
comments: str | None = Field(None, description="Reviewer comments")
review_duration_seconds: int = Field(
..., ge=0, description="Time spent in review (seconds)"
)
# Firm sync events
class FirmSyncCompletedEventData(BaseEventData):
"""Event emitted when firm database sync is complete."""
firm_id: str = Field(..., description="Firm identifier")
connector_type: str = Field(
..., description="Connector type (iris, sage, xero, etc.)"
)
sync_id: str = Field(..., description="Unique sync run identifier")
records_synced: int = Field(..., ge=0, description="Number of records synced")
records_created: int = Field(..., ge=0, description="Records created")
records_updated: int = Field(..., ge=0, description="Records updated")
records_failed: int = Field(..., ge=0, description="Records that failed to sync")
success: bool = Field(..., description="Whether sync was successful")
error_message: str | None = Field(None, description="Error message if failed")
processing_time_ms: int = Field(
..., ge=0, description="Processing time in milliseconds"
)
# Schema mapping for topic -> data class
EVENT_SCHEMA_MAP: dict[str, type[BaseEventData]] = {
"doc.ingested": DocumentIngestedEventData,
"doc.ocr_ready": DocumentOCRReadyEventData,
"doc.extracted": DocumentExtractedEventData,
"kg.upsert.ready": KGUpsertReadyEventData,
"kg.upserted": KGUpsertedEventData,
"rag.indexed": RAGIndexedEventData,
"calc.schedule_ready": CalculationReadyEventData,
"form.filled": FormFilledEventData,
"hmrc.submitted": HMRCSubmittedEventData,
"review.requested": ReviewRequestedEventData,
"review.completed": ReviewCompletedEventData,
"firm.sync.completed": FirmSyncCompletedEventData,
}
def validate_event_data(topic: str, data: dict[str, Any]) -> BaseEventData:
"""
Validate event data against the schema for the given topic.
Args:
topic: Event topic name
data: Raw event data dictionary
Returns:
Validated event data model
Raises:
ValueError: If topic is unknown or validation fails
"""
if topic not in EVENT_SCHEMA_MAP:
raise ValueError(f"Unknown event topic: {topic}")
schema_class = EVENT_SCHEMA_MAP[topic]
return schema_class.model_validate(data)
def get_schema_for_topic(topic: str) -> type[BaseEventData]:
"""
Get the Pydantic schema class for a given topic.
Args:
topic: Event topic name
Returns:
Schema class for the topic
Raises:
ValueError: If topic is unknown
"""
if topic not in EVENT_SCHEMA_MAP:
raise ValueError(f"Unknown event topic: {topic}")
return EVENT_SCHEMA_MAP[topic]

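A usage sketch for the validators above; the import path is assumed from the repo layout, and the payload values are illustrative (the checksum just has to be 64 hex characters).

```python
# Sketch: validate a raw doc.ingested payload into its typed model.
from libs.schemas.events import (  # import path assumed from the repo layout
    DocumentIngestedEventData,
    validate_event_data,
)

event = validate_event_data(
    "doc.ingested",
    {
        "doc_id": "01JD8Z9Y6T3Q4R5S6T7U8V9W0X",  # illustrative ULID
        "filename": "invoice-042.pdf",
        "mime_type": "application/pdf",
        "size_bytes": 182_044,
        "checksum_sha256": "a" * 64,
        "kind": "invoice",
        "source": "manual_upload",
        "storage_path": "docs/raw/invoice-042.pdf",
    },
)
assert isinstance(event, DocumentIngestedEventData)
```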

@@ -0,0 +1,338 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Coverage Policy Schema",
"type": "object",
"required": [
"version",
"jurisdiction",
"tax_year",
"tax_year_boundary",
"defaults",
"document_kinds",
"triggers",
"schedules",
"status_classifier",
"conflict_resolution",
"question_templates"
],
"properties": {
"version": {
"type": "string",
"pattern": "^\\d+\\.\\d+$"
},
"jurisdiction": {
"type": "string",
"enum": ["UK", "US", "CA", "AU"]
},
"tax_year": {
"type": "string",
"pattern": "^\\d{4}-\\d{2}$"
},
"tax_year_boundary": {
"type": "object",
"required": ["start", "end"],
"properties": {
"start": {
"type": "string",
"format": "date"
},
"end": {
"type": "string",
"format": "date"
}
}
},
"defaults": {
"type": "object",
"required": ["confidence_thresholds"],
"properties": {
"confidence_thresholds": {
"type": "object",
"properties": {
"ocr": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"extract": {
"type": "number",
"minimum": 0,
"maximum": 1
}
}
},
"date_tolerance_days": {
"type": "integer",
"minimum": 0
},
"require_lineage_bbox": {
"type": "boolean"
},
"allow_bank_substantiation": {
"type": "boolean"
}
}
},
"document_kinds": {
"type": "array",
"items": {
"type": "string",
"minLength": 1
},
"minItems": 1,
"uniqueItems": true
},
"guidance_refs": {
"type": "object",
"patternProperties": {
"^[A-Z0-9_]+$": {
"type": "object",
"required": ["doc_id", "kind"],
"properties": {
"doc_id": {
"type": "string",
"minLength": 1
},
"kind": {
"type": "string",
"minLength": 1
}
}
}
}
},
"triggers": {
"type": "object",
"patternProperties": {
"^SA\\d+[A-Z]*$": {
"type": "object",
"properties": {
"any_of": {
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
},
"all_of": {
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
}
},
"anyOf": [{ "required": ["any_of"] }, { "required": ["all_of"] }]
}
}
},
"schedules": {
"type": "object",
"patternProperties": {
"^SA\\d+[A-Z]*$": {
"type": "object",
"properties": {
"guidance_hint": {
"type": "string"
},
"evidence": {
"type": "array",
"items": {
"type": "object",
"required": ["id", "role"],
"properties": {
"id": {
"type": "string",
"minLength": 1
},
"role": {
"type": "string",
"enum": ["REQUIRED", "CONDITIONALLY_REQUIRED", "OPTIONAL"]
},
"condition": {
"type": "string"
},
"boxes": {
"type": "array",
"items": {
"type": "string",
"pattern": "^SA\\d+[A-Z]*_b\\d+(_\\d+)?$"
},
"minItems": 0
},
"acceptable_alternatives": {
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
},
"validity": {
"type": "object",
"properties": {
"within_tax_year": {
"type": "boolean"
},
"available_by": {
"type": "string",
"format": "date"
}
}
},
"reasons": {
"type": "object",
"properties": {
"short": {
"type": "string"
}
}
}
}
}
},
"cross_checks": {
"type": "array",
"items": {
"type": "object",
"required": ["name", "logic"],
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"logic": {
"type": "string",
"minLength": 1
}
}
}
},
"selection_rule": {
"type": "object"
},
"notes": {
"type": "object"
}
}
}
}
},
"status_classifier": {
"type": "object",
"required": [
"present_verified",
"present_unverified",
"conflicting",
"missing"
],
"properties": {
"present_verified": {
"$ref": "#/definitions/statusClassifier"
},
"present_unverified": {
"$ref": "#/definitions/statusClassifier"
},
"conflicting": {
"$ref": "#/definitions/statusClassifier"
},
"missing": {
"$ref": "#/definitions/statusClassifier"
}
}
},
"conflict_resolution": {
"type": "object",
"required": ["precedence"],
"properties": {
"precedence": {
"type": "array",
"items": {
"type": "string",
"minLength": 1
},
"minItems": 1
},
"escalation": {
"type": "object"
}
}
},
"question_templates": {
"type": "object",
"required": ["default"],
"properties": {
"default": {
"type": "object",
"required": ["text", "why"],
"properties": {
"text": {
"type": "string",
"minLength": 1
},
"why": {
"type": "string",
"minLength": 1
}
}
},
"reasons": {
"type": "object",
"patternProperties": {
"^[A-Za-z0-9_]+$": {
"type": "string",
"minLength": 1
}
}
}
}
},
"privacy": {
"type": "object",
"properties": {
"vector_pii_free": {
"type": "boolean"
},
"redact_patterns": {
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
}
}
}
},
"definitions": {
"statusClassifier": {
"type": "object",
"properties": {
"min_ocr": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"min_extract": {
"type": "number",
"minimum": 0,
"maximum": 1
},
"date_in_year": {
"type": "boolean"
},
"date_in_year_or_tolerance": {
"type": "boolean"
},
"conflict_rules": {
"type": "array",
"items": {
"type": "string",
"minLength": 1
}
},
"default": {
"type": "boolean"
}
}
}
}
}

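A hedged sketch of how a policy document could be checked against the schema above with `jsonschema` (already a pinned dependency); the schema file path and the partial policy are assumptions for illustration.

```python
# Sketch: report which required sections a draft coverage policy is missing.
import json

from jsonschema import Draft7Validator

with open("schemas/coverage_policy.schema.json", encoding="utf-8") as fh:  # assumed path
    schema = json.load(fh)

Draft7Validator.check_schema(schema)  # the schema itself is well-formed draft-07
validator = Draft7Validator(schema)

draft_policy = {"version": "1.0", "jurisdiction": "UK", "tax_year": "2024-25"}
for error in sorted(validator.iter_errors(draft_policy), key=lambda e: list(e.path)):
    print(error.message)  # e.g. "'tax_year_boundary' is a required property"
```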
schemas/kg_schema.json (new file)
@@ -0,0 +1,202 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Tax Knowledge Graph Schema",
"definitions": {
"temporal_properties": {
"type": "object",
"properties": {
"valid_from": { "type": "string", "format": "date-time" },
"valid_to": { "type": "string", "format": "date-time" },
"asserted_at": { "type": "string", "format": "date-time" },
"retracted_at": { "type": ["string", "null"], "format": "date-time" },
"source": { "type": "string" },
"extractor_version": { "type": "string" }
},
"required": ["valid_from", "asserted_at", "source", "extractor_version"]
},
"provenance": {
"type": "object",
"properties": {
"doc_id": { "type": "string" },
"page": { "type": "integer", "minimum": 1 },
"bbox": {
"type": "object",
"properties": {
"x": { "type": "number" },
"y": { "type": "number" },
"width": { "type": "number" },
"height": { "type": "number" }
},
"required": ["x", "y", "width", "height"]
},
"text_hash": { "type": "string" },
"ocr_confidence": { "type": "number", "minimum": 0, "maximum": 1 }
},
"required": ["doc_id", "page", "text_hash"]
}
},
"oneOf": [
{
"title": "TaxpayerProfile",
"type": "object",
"properties": {
"node_type": { "const": "TaxpayerProfile" },
"taxpayer_id": { "type": "string" },
"type": { "enum": ["Individual", "Partnership", "Company"] },
"residence": { "type": "string" },
"contact": {
"type": "object",
"properties": {
"email": { "type": "string", "format": "email" },
"phone": { "type": "string" },
"address": { "type": "string" }
}
},
"tax_years": { "type": "array", "items": { "type": "string" } },
"utr": { "type": "string", "pattern": "^[0-9]{10}$" },
"ni_number": {
"type": "string",
"pattern": "^[A-CEGHJ-PR-TW-Z]{2}\\d{6}[A-D]$"
}
},
"allOf": [{ "$ref": "#/definitions/temporal_properties" }],
"required": ["node_type", "taxpayer_id", "type"]
},
{
"title": "TaxYear",
"type": "object",
"properties": {
"node_type": { "const": "TaxYear" },
"label": { "type": "string" },
"start_date": { "type": "string", "format": "date" },
"end_date": { "type": "string", "format": "date" },
"jurisdiction_ref": { "type": "string" }
},
"allOf": [{ "$ref": "#/definitions/temporal_properties" }],
"required": [
"node_type",
"label",
"start_date",
"end_date",
"jurisdiction_ref"
]
},
{
"title": "Document",
"type": "object",
"properties": {
"node_type": { "const": "Document" },
"doc_id": { "type": "string" },
"kind": {
"enum": [
"bank_statement",
"invoice",
"receipt",
"p_and_l",
"balance_sheet",
"payslip",
"dividend_voucher",
"property_statement",
"prior_return",
"letter",
"certificate"
]
},
"source": { "type": "string" },
"mime": { "type": "string" },
"date_range": {
"type": "object",
"properties": {
"start": { "type": "string", "format": "date" },
"end": { "type": "string", "format": "date" }
}
},
"checksum": { "type": "string" },
"file_size": { "type": "integer" },
"pages": { "type": "integer", "minimum": 1 }
},
"allOf": [{ "$ref": "#/definitions/temporal_properties" }],
"required": ["node_type", "doc_id", "kind", "source", "checksum"]
},
{
"title": "Evidence",
"type": "object",
"properties": {
"node_type": { "const": "Evidence" },
"snippet_id": { "type": "string" },
"doc_ref": { "type": "string" },
"page": { "type": "integer", "minimum": 1 },
"bbox": {
"type": "object",
"properties": {
"x": { "type": "number" },
"y": { "type": "number" },
"width": { "type": "number" },
"height": { "type": "number" }
},
"required": ["x", "y", "width", "height"]
},
"text_hash": { "type": "string" },
"ocr_confidence": { "type": "number", "minimum": 0, "maximum": 1 },
"extracted_text": { "type": "string" }
},
"allOf": [{ "$ref": "#/definitions/temporal_properties" }],
"required": [
"node_type",
"snippet_id",
"doc_ref",
"page",
"bbox",
"text_hash"
]
},
{
"title": "IncomeItem",
"type": "object",
"properties": {
"node_type": { "const": "IncomeItem" },
"type": {
"enum": [
"employment",
"self_employment",
"property",
"dividend",
"interest",
"other"
]
},
"gross": { "type": "number" },
"net": { "type": "number" },
"tax_withheld": { "type": "number" },
"period_start": { "type": "string", "format": "date" },
"period_end": { "type": "string", "format": "date" },
"currency": { "type": "string", "pattern": "^[A-Z]{3}$" },
"description": { "type": "string" }
},
"allOf": [
{ "$ref": "#/definitions/temporal_properties" },
{ "$ref": "#/definitions/provenance" }
],
"required": ["node_type", "type", "gross", "currency"]
},
{
"title": "ExpenseItem",
"type": "object",
"properties": {
"node_type": { "const": "ExpenseItem" },
"type": { "enum": ["business", "property", "capital", "personal"] },
"amount": { "type": "number" },
"category": { "type": "string" },
"capitalizable_flag": { "type": "boolean" },
"currency": { "type": "string", "pattern": "^[A-Z]{3}$" },
"description": { "type": "string" },
"allowable": { "type": "boolean" }
},
"allOf": [
{ "$ref": "#/definitions/temporal_properties" },
{ "$ref": "#/definitions/provenance" }
],
"required": ["node_type", "type", "amount", "currency"]
}
]
}

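For orientation, a node payload that satisfies the `IncomeItem` branch above must merge the temporal and provenance blocks into the same object (via `allOf`); the values here are illustrative only.

```python
# Sketch: an IncomeItem node shaped the way the KG schema expects it.
income_item = {
    "node_type": "IncomeItem",
    "type": "employment",
    "gross": 42_000.00,
    "tax_withheld": 6_300.00,
    "currency": "GBP",
    # temporal_properties (required: valid_from, asserted_at, source, extractor_version)
    "valid_from": "2024-04-06T00:00:00Z",
    "asserted_at": "2025-01-15T10:30:00Z",
    "source": "svc-normalize-map",
    "extractor_version": "0.4.2",
    # provenance (required: doc_id, page, text_hash)
    "doc_id": "01JD8Z9Y6T3Q4R5S6T7U8V9W0X",
    "page": 1,
    "text_hash": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
}
```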

@@ -1,475 +1,105 @@
# ROLE
You are a **Solution Architect + Ontologist + Data Engineer + Platform/SRE** delivering a **production-grade accounting knowledge system** that ingests documents, fuses a **Knowledge Graph (KG)** with a **Vector DB (Qdrant)** for RAG, integrates with **Firm Databases**, and powers **AI agents** to complete workflows like **UK Self Assessment** with **auditable provenance**.
**Authentication & authorization are centralized at the edge:** **Traefik** gateway + **Authentik** SSO (OIDC/ForwardAuth). **Backend services trust Traefik** on an internal network and consume user/role claims from forwarded headers/JWT.
# OBJECTIVE
Deliver a complete, implementable solution: ontology, extraction pipeline, RAG+KG retrieval, deterministic calculators, APIs, validations, **architecture & stack**, infra-as-code, CI/CD, observability, security/governance, test plan, and a worked example, so agents can:
1. read documents (and scrape portals via RPA),
2. populate/maintain a compliant accounting/tax KG,
3. retrieve firm knowledge via RAG (vector + keyword + graph),
4. compute/validate schedules and fill forms,
5. submit (stub/sandbox/live),
6. justify every output with **traceable provenance** (doc/page/bbox) and citations.
# SCOPE & VARIABLES
- **Jurisdiction:** {{jurisdiction}} (default: UK)
- **Tax regime / forms:** {{forms}} (default: SA100 + SA102, SA103, SA105, SA110; optional SA108)
- **Accounting basis:** {{standards}} (default: UK GAAP; support IFRS/XBRL mapping)
- **Document types:** bank statements, invoices, receipts, P\&L, balance sheet, payslips, dividend vouchers, property statements, prior returns, letters, certificates.
- **Primary stores:** KG = Neo4j; RAG = Qdrant; Objects = MinIO; Secrets = Vault; IdP/SSO = Authentik; **API Gateway = Traefik**.
- **PII constraints:** GDPR/UK-GDPR; **no raw PII in vector DB** (de-identify before indexing); role-based access; encryption; retention; right-to-erasure.
---
# ARCHITECTURE & STACK (LOCAL-FIRST; SCALE-OUT READY)
## Edge & Identity (centralized)
- **Traefik** (reverse proxy & ingress) terminates TLS, does **AuthN/AuthZ via Authentik**:
- Use **Authentik Outpost (ForwardAuth)** middleware in Traefik.
- Traefik injects verified headers/JWT to upstream services: `X-Authenticated-User`, `X-Authenticated-Email`, `X-Authenticated-Groups`, `Authorization: Bearer <jwt>`.
- **Per-route RBAC** via Traefik middlewares (group/claim checks); services only enforce **fine-grained, app-level authorization** using forwarded claims (no OIDC in each service).
- All services are **private** (only reachable behind Traefik on an internal Docker/K8s network). Direct access is denied.
## Services (independent deployables; Python 3.12 unless stated)
1. **svc-ingestion**: uploads/URLs; checksum; MinIO write; emits `doc.ingested`.
2. **svc-rpa**: Playwright RPA for firm/client portals; Prefect-scheduled; emits `doc.ingested`.
3. **svc-ocr**: Tesseract (local) or Textract (scale); de-skew/rotation/layout; emits `doc.ocr_ready`.
4. **svc-extract**: LLM + rules + table detectors → **schema-constrained JSON** (kv + tables + bbox/page); emits `doc.extracted`.
5. **svc-normalize-map**: normalize currency/dates; entity resolution; assign tax year; map to KG nodes/edges with **Evidence** anchors; emits `kg.upserted`.
6. **svc-kg**: Neo4j DDL + **SHACL** validation; **bitemporal** writes `{valid_from, valid_to, asserted_at}`; RDF export.
7. **svc-rag-indexer**: chunk/de-identify/embed; upsert **Qdrant** collections (firm knowledge, legislation, best practices, glossary).
8. **svc-rag-retriever**: **hybrid retrieval** (dense + sparse) + rerank + **KG-fusion**; returns chunks + citations + KG join hints.
9. **svc-reason**: deterministic calculators (employment, self-employment, property, dividends/interest, allowances, NIC, HICBC, student loans); Cypher materializers; explanations.
10. **svc-forms**: fill PDFs; ZIP evidence bundle (signed manifest).
11. **svc-hmrc**: submit stub|sandbox|live; rate-limit & retries; submission audit.
12. **svc-firm-connectors**: read-only connectors to Firm Databases; sync to **Secure Client Data Store** with lineage.
13. **ui-review**: Next.js reviewer portal (SSO via Traefik+Authentik); reviewers accept/override extractions.
## Orchestration & Messaging
- **Prefect 2.x** for local orchestration; **Temporal** for production scale (sagas, retries, idempotency).
- Events (Kafka or SQS/SNS): `doc.ingested`, `doc.ocr_ready`, `doc.extracted`, `kg.upserted`, `rag.indexed`, `calc.schedule_ready`, `form.filled`, `hmrc.submitted`, `review.requested`, `review.completed`, `firm.sync.completed`.
## Concrete Stack (pin/assume unless replaced)
- **Languages:** Python **3.12**, TypeScript 5/Node 20
- **Frameworks:** FastAPI, Pydantic v2, SQLAlchemy 2 (ledger), Prefect 2.x (local), Temporal (scale)
- **Gateway:** **Traefik** 3.x with **Authentik Outpost** (ForwardAuth)
- **Identity/SSO:** **Authentik** (OIDC/OAuth2)
- **Secrets:** **Vault** (AppRole/JWT; Transit for envelope encryption)
- **Object Storage:** **MinIO** (S3 API)
- **Vector DB:** **Qdrant** 1.x (dense + sparse hybrid)
- **Embeddings/Rerankers (local-first):**
Dense: `bge-m3` or `bge-small-en-v1.5`; Sparse: BM25/SPLADE (Qdrant sparse); Reranker: `cross-encoder/ms-marco-MiniLM-L-6-v2`
- **Datastores:**
- **Secure Client Data Store:** PostgreSQL 15 (encrypted; RLS; pgcrypto)
- **KG:** Neo4j 5.x
- **Cache/locks:** Redis
- **Infra:** **Docker-Compose** for local; **Kubernetes** for scale (Helm, ArgoCD optional later)
- **CI/CD:** **Gitea** + Gitea Actions (or Drone) → container registry → deploy
## Data Layer (three pillars + fusion)
1. **Firm Databases** → **Firm Connectors** (read-only) → **Secure Client Data Store (Postgres)** with lineage.
2. **Vector DB / Knowledge Base (Qdrant)**: internal knowledge, legislation, best practices, glossary; **no PII** (placeholders + hashes).
3. **Knowledge Graph (Neo4j)**: accounting/tax ontology with evidence anchors and rules/calculations.
**Fusion strategy:** Query → RAG retrieve (Qdrant) + KG traverse → **fusion** scoring (α·dense + β·sparse + γ·KG-link-boost) → results with citations (URL/doc_id+page/anchor) and graph paths.
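A minimal sketch of the fusion score named above; the weights and the boolean KG-link signal are placeholders, not the shipped retriever.

```python
# Sketch: score = alpha*dense + beta*sparse + gamma*kg_link_boost.
def fuse_score(
    dense: float,
    sparse: float,
    kg_linked: bool,
    alpha: float = 0.6,
    beta: float = 0.3,
    gamma: float = 0.1,
) -> float:
    return alpha * dense + beta * sparse + gamma * (1.0 if kg_linked else 0.0)


# A chunk linked to an applicable Rule/Calculation/Evidence outranks an
# unlinked chunk with identical dense and sparse similarity.
assert fuse_score(0.8, 0.5, True) > fuse_score(0.8, 0.5, False)
```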
## Non-functional Targets
- SLOs: ingest→extract p95 ≤ 3m; reconciliation ≥ 98%; lineage coverage ≥ 99%; schedule error ≤ 1/1k
- Throughput: local ≥ 2 docs/s; scale ≥ 5 docs/s sustained; burst 20 docs/s
- Idempotency: `sha256(doc_checksum + extractor_version)` (see the sketch after this list)
- Retention: raw images 7y; derived text 2y; vectors (non-PII) 7y; PII-min logs 90d
- Erasure: per `client_id` across MinIO, KG, Qdrant (payload filter), Postgres rows
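A one-line restatement of the idempotency key from the list above; the concatenation order and encoding are assumptions.

```python
# Sketch: derive the idempotency key for a processed document.
import hashlib


def idempotency_key(doc_checksum: str, extractor_version: str) -> str:
    return hashlib.sha256(f"{doc_checksum}{extractor_version}".encode()).hexdigest()


key = idempotency_key("a" * 64, "0.4.2")  # illustrative inputs
```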
---
# REPOSITORY LAYOUT (monorepo, local-first)
```
repo/
apps/
svc-ingestion/ svc-rpa/ svc-ocr/ svc-extract/
svc-normalize-map/ svc-kg/ svc-rag-indexer/ svc-rag-retriever/
svc-reason/ svc-forms/ svc-hmrc/ svc-firm-connectors/
ui-review/
kg/
ONTOLOGY.md
schemas/{nodes_and_edges.schema.json, context.jsonld, shapes.ttl}
db/{neo4j_schema.cypher, seed.cypher}
reasoning/schedule_queries.cypher
retrieval/
chunking.yaml qdrant_collections.json indexer.py retriever.py fusion.py
config/{heuristics.yaml, mapping.json}
prompts/{doc_classify.txt, kv_extract.txt, table_extract.txt, entity_link.txt, rag_answer.txt}
pipeline/etl.py
infra/
compose/{docker-compose.local.yml, traefik.yml, traefik-dynamic.yml, env.example}
k8s/ (optional later: Helm charts)
security/{dpia.md, ropa.md, retention_policy.md, threat_model.md}
ops/
runbooks/{ingest.md, calculators.md, hmrc.md, vector-indexing.md, dr-restore.md}
dashboards/grafana.json
alerts/prometheus-rules.yaml
tests/{unit, integration, e2e, data/{synthetic, golden}}
Makefile
.gitea/workflows/ci.yml
mkdocs.yml
```
---
# DELIVERABLES (RETURN ALL AS MARKED CODE BLOCKS)
1. **Ontology** (Concept model; JSON-Schema; JSON-LD; Neo4j DDL)
2. **Heuristics & Rules (YAML)**
3. **Extraction pipeline & prompts**
4. **RAG & Retrieval Layer** (chunking, Qdrant collections, indexer, retriever, fusion)
5. **Reasoning layer** (deterministic calculators + Cypher + tests)
6. **Agent interface (Tooling API)**
7. **Quality & Safety** (datasets, metrics, tests, red-team)
8. **Graph Constraints** (SHACL, IDs, bitemporal)
9. **Security & Compliance** (DPIA, ROPA, encryption, auditability)
10. **Worked Example** (end-to-end UK SA sample)
11. **Observability & SRE** (SLIs/SLOs, tracing, idempotency, DR, cost controls)
12. **Architecture & Local Infra** (**docker-compose** with Traefik + Authentik + Vault + MinIO + Qdrant + Neo4j + Postgres + Redis + Prometheus/Grafana + Loki + Unleash + services)
13. **Repo Scaffolding & Makefile** (dev tasks, lint, test, build, run)
14. **Firm Database Connectors** (data contracts, sync jobs, lineage)
15. **Traefik & Authentik configs** (static+dynamic, ForwardAuth, route labels)
---
# ONTOLOGY REQUIREMENTS (as before + RAG links)
- Nodes: `TaxpayerProfile`, `TaxYear`, `Jurisdiction`, `TaxForm`, `Schedule`, `FormBox`, `Document`, `Evidence`, `Party`, `Account`, `IncomeItem`, `ExpenseItem`, `PropertyAsset`, `BusinessActivity`, `Allowance`, `Relief`, `PensionContribution`, `StudentLoanPlan`, `Payment`, `ExchangeRate`, `Calculation`, `Rule`, `NormalizationEvent`, `Reconciliation`, `Consent`, `LegalBasis`, `ImportJob`, `ETLRun`
- Relationships: `BELONGS_TO`, `OF_TAX_YEAR`, `IN_JURISDICTION`, `HAS_SECTION`, `HAS_BOX`, `REPORTED_IN`, `COMPUTES`, `DERIVED_FROM`, `SUPPORTED_BY`, `PAID_BY`, `PAID_TO`, `OWNS`, `RENTED_BY`, `EMPLOYED_BY`, `APPLIES_TO`, `APPLIES`, `VIOLATES`, `NORMALIZED_FROM`, `HAS_VALID_BASIS`, `PRODUCED_BY`, **`CITES`**, **`DESCRIBES`**
- **Bitemporal** and **provenance** mandatory.
---
# UK-SPECIFIC REQUIREMENTS
- Year boundary 6 Apr–5 Apr; basis period reform toggle
- Employment aggregation, BIK, PAYE offsets
- Self-employment: allowable/disallowable, capital allowances (AIA/WDA/SBA), loss rules, **NIC Class 2 & 4**
- Property: FHL tests, **mortgage interest 20% credit**, Rent-a-Room, joint splits
- Savings/dividends: allowances & rate bands; ordering
- Personal allowance tapering; Gift Aid & pension gross-up; **HICBC**; **Student Loan** plans 1/2/4/5 & PGL
- Rounding per `FormBox.rounding_rule`
---
# YAML HEURISTICS (KEEP SEPARATE FILE)
- document_kinds, field_normalization, line_item_mapping
- period_inference (UK boundary + reform), dedupe_rules
- **validation_rules:** `utr_checksum`, `ni_number_regex`, `iban_check`, `vat_gb_mod97`, `rounding_policy: "HMRC"`, `numeric_tolerance: 0.01`
- **entity_resolution:** blocking keys, fuzzy thresholds, canonical source priority
- **privacy_redaction:** `mask_except_last4` for NI/UTR/IBAN/sort_code/phone/email
- **jurisdiction_overrides:** by {{jurisdiction}} and {{tax\_year}}
---
# EXTRACTION PIPELINE (SPECIFY CODE & PROMPTS)
- ingest → classify → OCR/layout → extract (schema-constrained JSON with bbox/page) → validate → normalize → map_to_graph → post-checks
- Prompts: `doc_classify`, `kv_extract`, `table_extract` (multi-page), `entity_link`
- Contract: **JSON schema enforcement** with retry/validator loop; temperature guidance
- Reliability: de-skew/rotation/language/handwriting policy
- Mapping config: JSON mapping to nodes/edges + provenance (doc_id/page/bbox/text_hash)
---
# RAG & RETRIEVAL LAYER (Qdrant + KG Fusion)
- Collections: `firm_knowledge`, `legislation`, `best_practices`, `glossary` (payloads include jurisdiction, tax_years, topic_tags, version, `pii_free:true`)
- Chunking: layout-aware; tables serialized; ~1.5k token chunks, 10–15% overlap
- Indexer: de-identify PII; placeholders only; embeddings (dense) + sparse; upsert with payload
- Retriever: hybrid scoring (α·dense + β·sparse), filters (jurisdiction/tax_year), rerank; return **citations** + **KG hints**
- Fusion: boost results linked to applicable `Rule`/`Calculation`/`Evidence` for current schedule
- Right-to-erasure: purge vectors via payload filter (`client_id?` only for client-authored knowledge)
---
# REASONING & CALCULATION (DETERMINISTIC)
- Order: incomes → allowances/capital allowances → loss offsets → personal allowance → savings/dividend bands → HICBC & student loans → NIC Class 2/4 → property 20% credit/FHL/Rent-a-Room (see the ordering sketch below)
- Cypher materializers per schedule/box; explanations via `DERIVED_FROM` and RAG `CITES`
- Unit tests per rule; golden files; property-based tests
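A sketch of the ordering above as an explicit pipeline; the step names are labels only, and the real calculators live in svc-reason.

```python
# Sketch: apply deterministic calculators in the fixed order given above.
from collections.abc import Callable

CALC_ORDER = [
    "incomes",
    "allowances_and_capital_allowances",
    "loss_offsets",
    "personal_allowance",
    "savings_and_dividend_bands",
    "hicbc_and_student_loans",
    "nic_class_2_and_4",
    "property_reliefs",  # 20% mortgage-interest credit, FHL, Rent-a-Room
]


def run_schedule(state: dict, calculators: dict[str, Callable[[dict], dict]]) -> dict:
    # Each registered step transforms the running state; unregistered steps are skipped.
    for step in CALC_ORDER:
        fn = calculators.get(step)
        if fn is not None:
            state = fn(state)
    return state
```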
---
# AGENT TOOLING API (JSON SCHEMAS)
1. `ComputeSchedule({tax_year, taxpayer_id, schedule_id}) -> {boxes[], totals[], explanations[]}` (see the contract sketch below)
2. `PopulateFormBoxes({tax_year, taxpayer_id, form_id}) -> {fields[], pdf_fields[], confidence, calibrated_confidence}`
3. `AskClarifyingQuestion({gap, candidate_values, evidence}) -> {question_text, missing_docs}`
4. `GenerateEvidencePack({scope}) -> {bundle_manifest, signed_hashes}`
5. `ExplainLineage({node_id|field}) -> {chain:[evidence], graph_paths}`
6. `CheckDocumentCoverage({tax_year, taxpayer_id}) -> {required_docs[], missing[], blockers[]}`
7. `SubmitToHMRC({tax_year, taxpayer_id, dry_run}) -> {status, submission_id?, errors[]}`
8. `ReconcileBank({account_id, period}) -> {unmatched_invoices[], unmatched_bank_lines[], deltas}`
9. `RAGSearch({query, tax_year?, jurisdiction?, k?}) -> {chunks[], citations[], kg_hints[], calibrated_confidence}`
10. `SyncFirmDatabases({since}) -> {objects_synced, errors[]}`
**Env flags:** `HMRC_MTD_ITSA_MODE`, `RATE_LIMITS`, `RAG_EMBEDDING_MODEL`, `RAG_RERANKER_MODEL`, `RAG_ALPHA_BETA_GAMMA`
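As a sketch of how one of these tool contracts might look in code, here is the `ComputeSchedule` request/response expressed as Pydantic models; field names follow the signature in item 1, and everything beyond that signature is an assumption.

```python
# Sketch: ComputeSchedule tool contract (illustrative, not the shipped schema).
from pydantic import BaseModel, Field


class ComputeScheduleRequest(BaseModel):
    tax_year: str = Field(..., description="e.g. '2024-25'")
    taxpayer_id: str
    schedule_id: str = Field(..., description="e.g. 'SA105'")


class BoxValue(BaseModel):
    box: str
    value: float
    evidence_ids: list[str] = []  # lineage back to Evidence nodes


class ComputeScheduleResponse(BaseModel):
    boxes: list[BoxValue]
    totals: dict[str, float]
    explanations: list[str]
```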
---
# SECURITY & COMPLIANCE
- **Traefik + Authentik SSO at edge** (ForwardAuth); per-route RBAC; inject verified claims headers/JWT
- **Vault** for secrets (AppRole/JWT, Transit for envelope encryption)
- **PII minimization:** no PII in Qdrant; placeholders; PII mapping only in Secure Client Data Store
- **Auditability:** tamper-evident logs (hash chain), signer identity, time sync
- **DPIA, ROPA, retention policy, right-to-erasure** workflows
---
# CI/CD (Gitea)
- Gitea Actions: `lint` (ruff/mypy/eslint), `test` (pytest+coverage, e2e), `build` (Docker), `scan` (Trivy/SAST), `push` (registry), `deploy` (compose up or K8s apply)
- SemVer tags; SBOM (Syft); OpenAPI + MkDocs publish; pre-commit hooks
---
# OBSERVABILITY & SRE
- SLIs/SLOs: ingest_time_p50, extract_precision@field ≥ 0.97, reconciliation_pass_rate ≥ 0.98, lineage_coverage ≥ 0.99, time_to_review_p95
- Dashboards: ingestion throughput, OCR error rates, extraction precision, mapping latency, calculator failures, HMRC submits, **RAG recall/precision & faithfulness**
- Alerts: OCR 5xx spike, extraction precision dip, reconciliation failures, HMRC rate-limit breaches, RAG drift
- Backups/DR: Neo4j dump (daily), Postgres PITR, Qdrant snapshot, MinIO versioning; quarterly restore test
- Cost controls: embedding cache, incremental indexing, compaction/TTL for stale vectors, cold archive for images
---
# OUTPUT FORMAT (STRICT)
Return results in the following order, each in its own fenced code block **with the exact language tag**:
```md
<!-- FILE: ONTOLOGY.md -->
# Concept Model
...
```
```json
// FILE: schemas/nodes_and_edges.schema.json
{ ... }
```
```json
// FILE: schemas/context.jsonld
{ ... }
```
```turtle
# FILE: schemas/shapes.ttl
# SHACL shapes for node/edge integrity
...
```
```cypher
// FILE: db/neo4j_schema.cypher
CREATE CONSTRAINT ...
```
```yaml
# FILE: config/heuristics.yaml
document_kinds: ...
```
```json
# FILE: config/mapping.json
{ "mappings": [ ... ] }
```
```yaml
# FILE: retrieval/chunking.yaml
# Layout-aware chunking, tables, overlap, token targets
```
```json
# FILE: retrieval/qdrant_collections.json
{
"collections": [
{ "name": "firm_knowledge", "dense": {"size": 1024}, "sparse": true, "payload_schema": { ... } },
{ "name": "legislation", "dense": {"size": 1024}, "sparse": true, "payload_schema": { ... } },
{ "name": "best_practices", "dense": {"size": 1024}, "sparse": true, "payload_schema": { ... } },
{ "name": "glossary", "dense": {"size": 768}, "sparse": true, "payload_schema": { ... } }
]
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Tax Agent Knowledge Graph Schema",
"description": "Schema for nodes and relationships in the AI Tax Agent knowledge graph",
"type": "object",
"properties": {
"nodes": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": { "type": "string", "description": "Unique identifier for the node" },
"type": {
"type": "string",
"description": "Type of the node (e.g., TaxpayerProfile, IncomeItem)",
"enum": [
"TaxpayerProfile",
"TaxYear",
"Jurisdiction",
"TaxForm",
"Schedule",
"FormBox",
"Document",
"Evidence",
"Party",
"Account",
"IncomeItem",
"ExpenseItem",
"PropertyAsset",
"BusinessActivity",
"Allowance",
"Relief",
"PensionContribution",
"StudentLoanPlan",
"Payment",
"ExchangeRate",
"Calculation",
"Rule",
"NormalizationEvent",
"Reconciliation",
"Consent",
"LegalBasis",
"ImportJob",
"ETLRun"
]
},
"properties": {
"type": "object",
"description": "Key-value properties of the node",
"additionalProperties": true
}
},
"required": ["id", "type", "properties"],
"additionalProperties": false
}
},
"relationships": {
"type": "array",
"items": {
"type": "object",
"properties": {
"id": { "type": "string", "description": "Unique identifier for the relationship" },
"type": {
"type": "string",
"description": "Type of the relationship (e.g., BELONGS_TO, HAS_BOX)",
"enum": [
"BELONGS_TO",
"OF_TAX_YEAR",
"IN_JURISDICTION",
"HAS_SECTION",
"HAS_BOX",
"REPORTED_IN",
"COMPUTES",
"DERIVED_FROM",
"SUPPORTED_BY",
"PAID_BY",
"PAID_TO",
"OWNS",
"RENTED_BY",
"EMPLOYED_BY",
"APPLIES_TO",
"APPLIES",
"VIOLATES",
"NORMALIZED_FROM",
"HAS_VALID_BASIS",
"PRODUCED_BY",
"CITES",
"DESCRIBES"
]
},
"sourceId": { "type": "string", "description": "ID of the source node" },
"targetId": { "type": "string", "description": "ID of the target node" },
"properties": {
"type": "object",
"description": "Key-value properties of the relationship",
"additionalProperties": true
}
},
"required": ["id", "type", "sourceId", "targetId"],
"additionalProperties": false
}
}
},
"required": ["nodes", "relationships"]
}
```
```python
# FILE: retrieval/indexer.py
# De-identify -> embed dense/sparse -> upsert to Qdrant with payload
...
```
```python
# FILE: retrieval/retriever.py
# Hybrid retrieval (alpha,beta), rerank, filters, return citations + KG hints
...
```
```python
# FILE: retrieval/fusion.py
# Join RAG chunks to KG rules/calculations/evidence; boost linked results
...
```
```txt
# FILE: prompts/rag_answer.txt
[Instruction: cite every claim; forbid PII; return calibrated_confidence; JSON contract]
```
```python
# FILE: pipeline/etl.py
def ingest(...): ...
```
```txt
# FILE: prompts/kv_extract.txt
[Prompt with JSON contract + examples]
```
```cypher
// FILE: reasoning/schedule_queries.cypher
// SA105: compute property income totals
MATCH ...
```
```json
// FILE: tools/agent_tools.json
{ ... }
```
```yaml
# FILE: infra/compose/docker-compose.local.yml
# Traefik (with Authentik ForwardAuth), Authentik, Vault, MinIO, Qdrant, Neo4j, Postgres, Redis, Prometheus/Grafana, Loki, Unleash, all services
```
```yaml
# FILE: infra/compose/traefik.yml
# Static config: entryPoints, providers, certificates, access logs
entryPoints:
web:
address: ":80"
websecure:
address: ":443"
providers:
docker: {}
file:
filename: /etc/traefik/traefik-dynamic.yml
api:
dashboard: true
log:
level: INFO
accessLog: {}
```
```yaml
# FILE: infra/compose/traefik-dynamic.yml
# Dynamic config: Authentik ForwardAuth middleware + routers per service
http:
middlewares:
authentik-forwardauth:
forwardAuth:
address: "http://authentik-outpost:9000/outpost.goauthentik.io/auth/traefik"
trustForwardHeader: true
authResponseHeaders:
- X-Authenticated-User
- X-Authenticated-Email
- X-Authenticated-Groups
- Authorization
rate-limit:
rateLimit:
average: 50
burst: 100
routers:
svc-extract:
rule: "Host(`api.local`) && PathPrefix(`/extract`)"
entryPoints: ["websecure"]
service: svc-extract
middlewares: ["authentik-forwardauth", "rate-limit"]
tls: {}
services:
svc-extract:
loadBalancer:
servers:
- url: "http://svc-extract:8000"
```
```yaml
# FILE: infra/compose/env.example
DOMAIN=local
EMAIL=admin@local
MINIO_ROOT_USER=minio
MINIO_ROOT_PASSWORD=miniopass
POSTGRES_PASSWORD=postgres
NEO4J_PASSWORD=neo4jpass
QDRANT__SERVICE__GRPC_PORT=6334
VAULT_DEV_ROOT_TOKEN_ID=root
AUTHENTIK_SECRET_KEY=changeme
RAG_EMBEDDING_MODEL=bge-small-en-v1.5
RAG_RERANKER_MODEL=cross-encoder/ms-marco-MiniLM-L-6-v2
```
```yaml
# FILE: .gitea/workflows/ci.yml
# Lint → Test → Build → Scan → Push → Deploy (compose up)
```
```makefile
# FILE: Makefile
# bootstrap, run, test, lint, build, deploy, format, seed
...
```
```md
<!-- FILE: TESTPLAN.md -->
## Datasets, Metrics, Acceptance Criteria
- Extraction precision/recall per field
- Schedule-level absolute error
- Reconciliation pass-rate
- Explanation coverage
- RAG retrieval: top-k recall, nDCG, faithfulness, groundedness
- Security: Traefik+Authentik route auth tests, header spoofing prevention (internal network, trusted proxy)
- Red-team cases (OCR noise, conflicting docs, PII leak prevention)
...
```
---
# STYLE & GUARANTEES
- Be **concise but complete**; prefer schemas/code over prose.
- **No chain-of-thought.** Provide final artifacts and brief rationales.
- Every numeric output must include **lineage to Evidence Document (page/bbox/text_hash)** and **citations** for narrative answers.
- Parameterize by {{jurisdiction}} and {{tax\_year}}.
- Include **calibrated_confidence** and name calibration method.
- Enforce **SHACL** on KG writes; reject/queue fixes on violation.
- **No PII** in Qdrant. Use de-ID placeholders; keep mappings only in Secure Client Data Store.
- Deterministic IDs; reproducible builds; version-pinned dependencies.
- **Trust boundary:** only Traefik exposes ports; all services on a private network; services accept only requests with Traefik's network identity; **never trust client-supplied auth headers**.
# START
Produce the deliverables now, in the exact order and file/block structure above, implementing the **local-first stack (Python 3.12, Prefect, Vault, MinIO, Playwright, Qdrant, Authentik, Traefik, Docker-Compose, Gitea)** with optional **scale-out** notes (Temporal, K8s) where specified.


@@ -168,7 +168,7 @@ main() {
# Check if setup is complete
if ! check_setup_complete; then
echo -e "${YELLOW}⚠️ Initial setup is still required${NC}"
echo -e "${BLUE}📋 Please complete setup at: https://auth.local/if/flow/initial-setup/${NC}"
echo -e "${BLUE}📋 Please complete setup at: https://auth.local.lan.lan/if/flow/initial-setup/${NC}"
echo -e "${BLUE}Use credentials: admin@local.local / admin123${NC}"
return 1
fi


@@ -134,13 +134,13 @@ main() {
else
echo -e "${YELLOW}⚠️ Could not get API token automatically${NC}"
echo -e "${BLUE}📋 Manual steps:${NC}"
echo -e " 1. Open ${BLUE}https://auth.local${NC} and log in"
echo -e " 1. Open ${BLUE}https://auth.local.lan${NC} and log in"
echo -e " 2. Go to Admin Interface > Tokens"
echo -e " 3. Create a new token and update AUTHENTIK_BOOTSTRAP_TOKEN in .env"
fi
else
echo -e "${YELLOW}📋 Initial setup still required:${NC}"
echo -e " 1. Open ${BLUE}https://auth.local/if/flow/initial-setup/${NC}"
echo -e " 1. Open ${BLUE}https://auth.local.lan.lan/if/flow/initial-setup/${NC}"
echo -e " 2. Complete the setup wizard with these credentials:"
echo -e " • Email: ${BLUE}$ADMIN_EMAIL${NC}"
echo -e " • Password: ${BLUE}$ADMIN_PASSWORD${NC}"

View File

@@ -13,7 +13,7 @@ NC='\033[0m' # No Color
# Configuration
DOMAIN=${DOMAIN:-local}
AUTHENTIK_URL="https://auth.${DOMAIN}"
ADMIN_EMAIL="admin@local"
ADMIN_EMAIL="admin@local.lan"
ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"
echo -e "${BLUE}🤖 Automatically completing Authentik initial setup...${NC}"
@@ -110,7 +110,7 @@ main() {
else
echo -e "${RED}❌ Automatic setup failed${NC}"
echo -e "${YELLOW}📋 Manual setup required:${NC}"
echo -e " 1. Open ${BLUE}https://auth.local/if/flow/initial-setup/${NC}"
echo -e " 1. Open ${BLUE}https://auth.local.lan.lan/if/flow/initial-setup/${NC}"
echo -e " 2. Use credentials: ${BLUE}$ADMIN_EMAIL${NC} / ${BLUE}$ADMIN_PASSWORD${NC}"
fi
else

View File

@@ -11,9 +11,14 @@ BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration
# Load environment variables
if [ -f "infra/compose/.env" ]; then
source "infra/compose/.env"
fi
DOMAIN=${DOMAIN:-local}
AUTHENTIK_URL="https://auth.${DOMAIN}"
ADMIN_EMAIL="admin@local"
ADMIN_EMAIL="admin@${DOMAIN}"
ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"
ENV_FILE="infra/compose/.env"
@@ -116,6 +121,12 @@ get_api_token() {
# Main function
main() {
# Check if we already have a valid token (not the placeholder)
if [ -n "${AUTHENTIK_BOOTSTRAP_TOKEN:-}" ] && [ "$AUTHENTIK_BOOTSTRAP_TOKEN" != "ak-bootstrap-token" ]; then
echo -e "${GREEN}✅ Bootstrap token already configured in .env${NC}"
return 0
fi
# Check if setup is already complete
if check_setup_status; then
echo -e "${GREEN}✅ Authentik setup is already complete${NC}"
@@ -132,15 +143,23 @@ main() {
echo -e "${GREEN}🎉 Setup complete! You can now run:${NC}"
echo -e " ${BLUE}make setup-authentik${NC} - to import blueprint configuration"
else
echo -e "${YELLOW}⚠️ Could not get API token automatically${NC}"
echo -e "${BLUE}📋 Manual steps:${NC}"
echo -e " 1. Open ${BLUE}https://auth.local${NC} and log in"
echo -e " 2. Go to Admin Interface > Tokens"
echo -e " 3. Create a new token and update AUTHENTIK_BOOTSTRAP_TOKEN in .env"
echo -e "${YELLOW}⚠️ Could not get API token automatically.${NC}"
echo -e " (This is expected if you changed the admin password during setup)"
echo
echo -e "${BLUE}📋 ACTION REQUIRED: Manual Configuration${NC}"
echo -e " 1. Open ${BLUE}https://auth.${DOMAIN}/if/admin/#/core/tokens${NC} and log in"
echo -e " 2. Click 'Create'"
echo -e " - Identifier: ${YELLOW}ai-tax-agent-bootstrap${NC}"
echo -e " - User: ${YELLOW}akadmin${NC}"
echo -e " 3. Copy the ${YELLOW}Key${NC} (it's a long string)"
echo -e " 4. Open ${YELLOW}infra/environments/local/.env${NC} in your editor"
echo -e " 5. Replace ${YELLOW}AUTHENTIK_BOOTSTRAP_TOKEN=ak-bootstrap-token${NC} with your new token"
echo -e " 6. Run ${BLUE}make setup-sso${NC} again"
exit 1
fi
else
echo -e "${YELLOW}📋 Initial setup still required:${NC}"
echo -e " 1. Open ${BLUE}https://auth.local/if/flow/initial-setup/${NC}"
echo -e " 1. Open ${BLUE}https://auth.${DOMAIN}/if/flow/initial-setup/${NC}"
echo -e " 2. Complete the setup wizard with these credentials:"
echo -e " • Email: ${BLUE}$ADMIN_EMAIL${NC}"
echo -e " • Password: ${BLUE}$ADMIN_PASSWORD${NC}"

View File

@@ -6,22 +6,22 @@ set -e
echo "Creating external Docker networks..."
# Create frontend network (for Traefik and public-facing services)
if ! docker network ls | grep -q "ai-tax-agent-frontend"; then
docker network create ai-tax-agent-frontend
echo "✅ Created frontend network: ai-tax-agent-frontend"
if ! docker network ls | grep -q "apa-frontend"; then
docker network create apa-frontend
echo "✅ Created frontend network: apa-frontend"
else
echo " Frontend network already exists: ai-tax-agent-frontend"
echo " Frontend network already exists: apa-frontend"
fi
# Create backend network (for internal services)
if ! docker network ls | grep -q "ai-tax-agent-backend"; then
docker network create ai-tax-agent-backend
echo "✅ Created backend network: ai-tax-agent-backend"
if ! docker network ls | grep -q "apa-backend"; then
docker network create apa-backend
echo "✅ Created backend network: apa-backend"
else
echo " Backend network already exists: ai-tax-agent-backend"
echo " Backend network already exists: apa-backend"
fi
echo "🎉 Network setup complete!"
echo ""
echo "Networks created:"
docker network ls | grep "ai-tax-agent"
docker network ls | grep "apa-"

View File

@@ -1,101 +0,0 @@
#!/bin/bash
# Comprehensive Deployment Script with Fixes
# Handles the complete deployment process with all discovered fixes
set -e
COMPOSE_FILE="infra/compose/docker-compose.local.yml"
echo "🚀 Starting comprehensive deployment with fixes..."
# Step 1: Create networks
echo "🌐 Creating Docker networks..."
./scripts/create-networks.sh
# Step 2: Generate certificates
echo "🔐 Generating development certificates..."
./scripts/generate-dev-certs.sh
# Step 3: Start core infrastructure first
echo "🏗️ Starting core infrastructure..."
cd infra/compose
docker compose -f docker-compose.local.yml up -d ata-traefik ata-postgres ata-redis
cd ../..
# Step 4: Wait for core services and fix database issues
echo "⏳ Waiting for core services..."
sleep 15
./scripts/fix-database-issues.sh
# Step 5: Start Authentik components in order
echo "🔐 Starting Authentik components..."
cd infra/compose
docker compose -f docker-compose.local.yml up -d ata-authentik-db ata-authentik-redis
sleep 10
docker compose -f docker-compose.local.yml up -d ata-authentik-server
sleep 15
docker compose -f docker-compose.local.yml up -d ata-authentik-worker ata-authentik-outpost
cd ../..
# Step 6: Start remaining infrastructure
echo "🏗️ Starting remaining infrastructure..."
cd infra/compose
docker compose -f docker-compose.local.yml up -d ata-vault ata-neo4j ata-qdrant ata-minio ata-prometheus ata-grafana ata-loki
cd ../..
# Step 7: Wait and verify Authentik is healthy
echo "⏳ Waiting for Authentik to be healthy..."
timeout=120
counter=0
while [ "$(docker inspect --format='{{.State.Health.Status}}' ata-authentik-server 2>/dev/null)" != "healthy" ]; do
if [ $counter -ge $timeout ]; then
echo "❌ Authentik server failed to become healthy within $timeout seconds"
echo "📋 Checking logs..."
docker compose -f infra/compose/docker-compose.local.yml logs --tail=10 ata-authentik-server
exit 1
fi
sleep 2
counter=$((counter + 2))
echo "⏳ Waiting for Authentik... ($counter/$timeout seconds)"
done
echo "✅ Authentik is healthy"
# Step 8: Start application services
echo "🚀 Starting application services..."
cd infra/compose
docker compose -f docker-compose.local.yml up -d \
ata-svc-ingestion ata-svc-extract ata-svc-forms ata-svc-hmrc ata-svc-kg \
ata-svc-normalize-map ata-svc-ocr ata-svc-rag-indexer ata-svc-rag-retriever \
ata-svc-reason ata-svc-rpa ata-svc-firm-connectors ata-svc-coverage ata-ui-review
cd ../..
# Step 9: Start Unleash (may fail, but that's OK)
echo "📊 Starting Unleash (may require manual configuration)..."
cd infra/compose
docker compose -f docker-compose.local.yml up -d ata-unleash || echo "⚠️ Unleash failed to start - may need manual token configuration"
cd ../..
# Step 10: Final verification
echo "🔍 Running final verification..."
sleep 10
./scripts/verify-infra.sh || echo "⚠️ Some services may need additional configuration"
echo ""
echo "🎉 Deployment complete!"
echo ""
echo "📋 Next steps:"
echo " 1. Complete Authentik setup: https://auth.local/if/flow/initial-setup/"
echo " 2. Configure applications in Authentik admin panel"
echo " 3. Test protected services redirect to Authentik"
echo ""
echo "🌐 Available endpoints:"
echo " • Traefik Dashboard: http://localhost:8080"
echo " • Authentik: https://auth.local"
echo " • Grafana: https://grafana.local"
echo " • Review UI: https://review.local (requires Authentik setup)"
echo ""
echo "🔧 Troubleshooting:"
echo " • Check logs: make logs"
echo " • Check status: make status"
echo " • Restart services: make restart"

View File

@@ -32,52 +32,16 @@ bash "$ROOT_DIR/scripts/generate-dev-certs.sh"
# 4) Bring up core infra (detached)
echo "🏗️ Starting Traefik + core infra..."
docker compose -f "$COMPOSE_DIR/docker-compose.local.yml" up -d \
ata-traefik ata-authentik-db ata-authentik-redis ata-authentik-server ata-authentik-worker \
ata-vault ata-postgres ata-neo4j ata-qdrant ata-minio ata-redis ata-prometheus ata-grafana ata-loki
docker compose -f "$COMPOSE_DIR/compose.yaml" up -d \
apa-traefik apa-authentik-db apa-authentik-redis apa-authentik-server apa-authentik-worker \
apa-vault apa-postgres apa-neo4j apa-qdrant apa-minio apa-redis apa-prometheus apa-grafana apa-loki
# 5) Wait for Traefik, then Authentik (initial-setup or login)
echo "⏳ Waiting for Traefik to respond..."
for i in {1..60}; do
code=$(curl -s -o /dev/null -w '%{http_code}' http://localhost:8080/ping || true)
if [[ "$code" == "200" ]]; then echo "✅ Traefik reachable"; break; fi
sleep 2
if [[ "$i" == 60 ]]; then echo "❌ Traefik not ready"; exit 1; fi
done
echo "⏳ Waiting for Authentik to respond..."
AUTH_HOST="auth.${DOMAIN}"
RESOLVE=(--resolve "${AUTH_HOST}:443:127.0.0.1")
for i in {1..60}; do
code_setup=$(curl -ks "${RESOLVE[@]}" -o /dev/null -w '%{http_code}' "https://${AUTH_HOST}/if/flow/initial-setup/" || true)
code_login=$(curl -ks "${RESOLVE[@]}" -o /dev/null -w '%{http_code}' "https://${AUTH_HOST}/if/flow/default-authentication-flow/" || true)
code_root=$(curl -ks "${RESOLVE[@]}" -o /dev/null -w '%{http_code}' "https://${AUTH_HOST}/" || true)
# If initial-setup returns 404 but login/root are healthy, treat as ready (already initialized)
if [[ "$code_setup" == "404" ]]; then
if [[ "$code_login" =~ ^(200|302|401)$ || "$code_root" =~ ^(200|302|401)$ ]]; then
echo "✅ Authentik reachable (initial setup not present)"; break
fi
fi
# If any key flow says OK, proceed
if [[ "$code_setup" =~ ^(200|302|401)$ || "$code_login" =~ ^(200|302|401)$ || "$code_root" =~ ^(200|302|401)$ ]]; then
echo "✅ Authentik reachable"; break
fi
sleep 5
if [[ "$i" == 60 ]]; then echo "❌ Authentik not ready"; exit 1; fi
done
# 6) Setup Authentik (optional automated)
if [[ -n "${AUTHENTIK_BOOTSTRAP_TOKEN:-}" ]]; then
echo "🔧 Running Authentik setup with bootstrap token..."
AUTHENTIK_API_TOKEN="$AUTHENTIK_BOOTSTRAP_TOKEN" DOMAIN="$DOMAIN" bash "$ROOT_DIR/scripts/setup-authentik.sh" || true
else
echo " No AUTHENTIK_BOOTSTRAP_TOKEN provided; skipping automated Authentik API setup"
fi
# ... (lines 40-79 skipped for brevity in replacement, but context maintained)
# 7) Start Authentik outpost if token present
if [[ -n "${AUTHENTIK_OUTPOST_TOKEN:-}" && "${AUTHENTIK_OUTPOST_TOKEN}" != "changeme" ]]; then
echo "🔐 Starting Authentik outpost..."
docker compose -f "$COMPOSE_DIR/docker-compose.local.yml" up -d ata-authentik-outpost || true
docker compose -f "$COMPOSE_DIR/compose.yaml" up -d apa-authentik-outpost || true
else
echo " Set AUTHENTIK_OUTPOST_TOKEN in $COMPOSE_DIR/.env to start authentik-outpost"
fi
@@ -85,10 +49,10 @@ fi
# 8) Start application services (optional)
if [[ "${START_APP_SERVICES:-true}" == "true" ]]; then
echo "🚀 Starting application services..."
docker compose -f "$COMPOSE_DIR/docker-compose.local.yml" up -d \
ata-svc-ingestion ata-svc-extract ata-svc-kg ata-svc-rag-retriever ata-svc-coverage \
ata-svc-firm-connectors ata-svc-forms ata-svc-hmrc ata-svc-normalize-map ata-svc-ocr \
ata-svc-rag-indexer ata-svc-reason ata-svc-rpa ata-ui-review ata-unleash || true
docker compose -f "$COMPOSE_DIR/compose.yaml" up -d \
apa-svc-ingestion apa-svc-extract apa-svc-kg apa-svc-rag-retriever apa-svc-coverage \
apa-svc-firm-connectors apa-svc-forms apa-svc-hmrc apa-svc-normalize-map apa-svc-ocr \
apa-svc-rag-indexer apa-svc-reason apa-svc-rpa apa-unleash || true
fi
echo "🎉 Dev environment is up"

View File

@@ -11,7 +11,7 @@ echo "🔧 Fixing database issues..."
echo "⏳ Waiting for PostgreSQL to be ready..."
timeout=60
counter=0
while ! docker exec ata-postgres pg_isready -U postgres >/dev/null 2>&1; do
while ! docker exec apa-postgres pg_isready -U postgres >/dev/null 2>&1; do
if [ $counter -ge $timeout ]; then
echo "❌ PostgreSQL failed to start within $timeout seconds"
exit 1
@@ -21,16 +21,29 @@ while ! docker exec ata-postgres pg_isready -U postgres >/dev/null 2>&1; do
done
echo "✅ PostgreSQL is ready"
# Create unleash database if it doesn't exist
echo "📊 Creating unleash database if needed..."
docker exec ata-postgres psql -U postgres -tc "SELECT 1 FROM pg_database WHERE datname = 'unleash'" | grep -q 1 || \
docker exec ata-postgres psql -U postgres -c "CREATE DATABASE unleash;"
echo "✅ Unleash database ready"
# Create unleash database and user if they don't exist
echo "📊 Creating unleash database and user if needed..."
docker exec apa-postgres psql -U postgres -d template1 -tc "SELECT 1 FROM pg_database WHERE datname = 'unleash'" | grep -q 1 || \
docker exec apa-postgres psql -U postgres -d template1 -c "CREATE DATABASE unleash;"
docker exec apa-postgres psql -U postgres -d template1 -tc "SELECT 1 FROM pg_user WHERE usename = 'unleash'" | grep -q 1 || \
docker exec apa-postgres psql -U postgres -d template1 -c "CREATE USER unleash WITH PASSWORD 'unleash';"
docker exec apa-postgres psql -U postgres -d template1 -c "GRANT ALL PRIVILEGES ON DATABASE unleash TO unleash;"
echo "✅ Unleash database and user ready"
# Create tax_system database for Authentik if needed
echo "🔐 Creating tax_system database for Authentik if needed..."
docker exec ata-postgres psql -U postgres -tc "SELECT 1 FROM pg_database WHERE datname = 'tax_system'" | grep -q 1 || \
docker exec ata-postgres psql -U postgres -c "CREATE DATABASE tax_system;"
docker exec apa-postgres psql -U postgres -d template1 -tc "SELECT 1 FROM pg_database WHERE datname = 'tax_system'" | grep -q 1 || \
docker exec apa-postgres psql -U postgres -d template1 -c "CREATE DATABASE tax_system;"
docker exec apa-postgres psql -U postgres -d template1 -tc "SELECT 1 FROM pg_database WHERE datname = 'authentik'" | grep -q 1 || \
docker exec apa-postgres psql -U postgres -d template1 -c "CREATE DATABASE authentik;"
echo "✅ Authentik database ready"
# Create authentik user if it doesn't exist
echo "🔐 Creating authentik user if needed..."
docker exec apa-postgres psql -U postgres -d template1 -tc "SELECT 1 FROM pg_user WHERE usename = 'authentik'" | grep -q 1 || \
docker exec apa-postgres psql -U postgres -d template1 -c "CREATE USER authentik WITH PASSWORD 'authentik';"
docker exec apa-postgres psql -U postgres -d template1 -c "GRANT ALL PRIVILEGES ON DATABASE tax_system TO authentik;"
docker exec apa-postgres psql -U postgres -d template1 -c "GRANT ALL PRIVILEGES ON DATABASE authentik TO authentik;"
echo "✅ Authentik user ready"
echo "🎉 Database issues fixed!"

View File

@@ -13,51 +13,38 @@ NC='\033[0m' # No Color
# Function to generate random string
generate_secret() {
local length=${1:-32}
openssl rand -base64 $length | tr -d "=+/" | cut -c1-$length
openssl rand -base64 "$length" | tr -d "=+/\n" | cut -c1-"$length"
}
# Function to generate UUID
generate_uuid() {
python3 -c "import uuid; print(uuid.uuid4())"
python3 - <<'PY'
import uuid
print(uuid.uuid4())
PY
}
echo -e "${BLUE}🔐 Generating secure secrets for AI Tax Agent...${NC}"
echo
write_env() {
local file=$1
local tmp="$file.tmp"
local ts
ts="$(date +%Y%m%d_%H%M%S)"
# Generate secrets
AUTHENTIK_SECRET_KEY=$(generate_secret 50)
AUTHENTIK_OUTPOST_TOKEN=$(generate_secret 64)
AUTHENTIK_API_CLIENT_SECRET=$(generate_secret 32)
AUTHENTIK_GRAFANA_CLIENT_SECRET=$(generate_secret 32)
GRAFANA_OAUTH_CLIENT_SECRET=$(generate_secret 32)
NEXTAUTH_SECRET=$(generate_secret 32)
VAULT_DEV_ROOT_TOKEN_ID=$(generate_uuid)
POSTGRES_PASSWORD=$(generate_secret 16)
NEO4J_PASSWORD=$(generate_secret 16)
AUTHENTIK_DB_PASSWORD=$(generate_secret 16)
MINIO_ROOT_PASSWORD=$(generate_secret 16)
GRAFANA_PASSWORD=$(generate_secret 16)
if [ -f "$file" ]; then
cp "$file" "${file}.backup.${ts}"
echo -e "${YELLOW}📋 Backed up existing env to ${file}.backup.${ts}${NC}"
fi
# Create .env file with generated secrets
ENV_FILE="infra/compose/.env"
BACKUP_FILE="infra/compose/.env.backup.$(date +%Y%m%d_%H%M%S)"
# Backup existing .env if it exists
if [ -f "$ENV_FILE" ]; then
echo -e "${YELLOW}📋 Backing up existing .env to $BACKUP_FILE${NC}"
cp "$ENV_FILE" "$BACKUP_FILE"
fi
echo -e "${GREEN}🔑 Generating new .env file with secure secrets...${NC}"
cat > "$ENV_FILE" << EOF
cat > "$tmp" << EOF
# AI Tax Agent Environment Configuration
# Generated on $(date)
# IMPORTANT: Keep these secrets secure and never commit to version control
# Domain Configuration
DOMAIN=local
EMAIL=admin@local
DOMAIN=${DOMAIN:-local.lan}
EMAIL=${EMAIL:-admin@local.lan}
ACME_EMAIL=${ACME_EMAIL:-${EMAIL:-admin@local.lan}}
TRAEFIK_CERT_RESOLVER=${TRAEFIK_CERT_RESOLVER:-}
# Database Passwords
POSTGRES_PASSWORD=$POSTGRES_PASSWORD
@@ -65,11 +52,13 @@ NEO4J_PASSWORD=$NEO4J_PASSWORD
AUTHENTIK_DB_PASSWORD=$AUTHENTIK_DB_PASSWORD
# Object Storage
MINIO_ROOT_USER=minio
MINIO_ROOT_USER=${MINIO_ROOT_USER:-minio}
MINIO_ROOT_PASSWORD=$MINIO_ROOT_PASSWORD
MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY:-$MINIO_ROOT_USER}
MINIO_SECRET_KEY=${MINIO_SECRET_KEY:-$MINIO_ROOT_PASSWORD}
# Vector Database
QDRANT__SERVICE__GRPC_PORT=6334
QDRANT__SERVICE__GRPC_PORT=${QDRANT__SERVICE__GRPC_PORT:-6334}
# Secrets Management
VAULT_DEV_ROOT_TOKEN_ID=$VAULT_DEV_ROOT_TOKEN_ID
@@ -77,90 +66,147 @@ VAULT_DEV_ROOT_TOKEN_ID=$VAULT_DEV_ROOT_TOKEN_ID
# Identity & SSO
AUTHENTIK_SECRET_KEY=$AUTHENTIK_SECRET_KEY
AUTHENTIK_OUTPOST_TOKEN=$AUTHENTIK_OUTPOST_TOKEN
AUTHENTIK_BOOTSTRAP_EMAIL=admin@local.lan
AUTHENTIK_BOOTSTRAP_PASSWORD=admin123
AUTHENTIK_BOOTSTRAP_TOKEN=ak-bootstrap-token
AUTHENTIK_BOOTSTRAP_EMAIL=${AUTHENTIK_BOOTSTRAP_EMAIL:-admin@${DOMAIN:-local.lan}}
AUTHENTIK_BOOTSTRAP_PASSWORD=${AUTHENTIK_BOOTSTRAP_PASSWORD:-admin123}
AUTHENTIK_BOOTSTRAP_TOKEN=${AUTHENTIK_BOOTSTRAP_TOKEN:-ak-bootstrap-token}
AUTHENTIK_API_CLIENT_SECRET=$AUTHENTIK_API_CLIENT_SECRET
AUTHENTIK_UI_REVIEW_CLIENT_SECRET=$AUTHENTIK_UI_REVIEW_CLIENT_SECRET
AUTHENTIK_GRAFANA_CLIENT_SECRET=$AUTHENTIK_GRAFANA_CLIENT_SECRET
AUTHENTIK_MINIO_CLIENT_SECRET=$AUTHENTIK_MINIO_CLIENT_SECRET
AUTHENTIK_VAULT_CLIENT_SECRET=$AUTHENTIK_VAULT_CLIENT_SECRET
# OAuth Client Secrets
GRAFANA_OAUTH_CLIENT_ID=grafana
GRAFANA_OAUTH_CLIENT_ID=${GRAFANA_OAUTH_CLIENT_ID:-grafana}
GRAFANA_OAUTH_CLIENT_SECRET=$GRAFANA_OAUTH_CLIENT_SECRET
# Monitoring
GRAFANA_PASSWORD=$GRAFANA_PASSWORD
# Feature Flags
UNLEASH_ADMIN_TOKEN=admin:development.unleash-insecure-admin-api-token
UNLEASH_ADMIN_TOKEN=$UNLEASH_ADMIN_TOKEN
# Application Configuration
NEXTAUTH_SECRET=$NEXTAUTH_SECRET
JWT_SECRET=$JWT_SECRET
ENCRYPTION_KEY=$ENCRYPTION_KEY
# Event Bus / NATS
EVENT_BUS_TYPE=${EVENT_BUS_TYPE:-nats}
NATS_SERVERS=${NATS_SERVERS:-nats://apa-nats:4222}
NATS_STREAM_NAME=${NATS_STREAM_NAME:-TAX_AGENT_EVENTS}
NATS_CONSUMER_GROUP=${NATS_CONSUMER_GROUP:-tax-agent}
NATS_LOG_LEVEL=${NATS_LOG_LEVEL:-info}
# Redis Configuration
REDIS_PASSWORD=$REDIS_PASSWORD
# RAG & ML Models
RAG_EMBEDDING_MODEL=bge-small-en-v1.5
RAG_RERANKER_MODEL=cross-encoder/ms-marco-MiniLM-L-6-v2
RAG_ALPHA_BETA_GAMMA=0.5,0.3,0.2
RAG_EMBEDDING_MODEL=${RAG_EMBEDDING_MODEL:-bge-small-en-v1.5}
RAG_RERANKER_MODEL=${RAG_RERANKER_MODEL:-cross-encoder/ms-marco-MiniLM-L-6-v2}
RAG_ALPHA_BETA_GAMMA=${RAG_ALPHA_BETA_GAMMA:-0.5,0.3,0.2}
# HMRC Integration
HMRC_MTD_ITSA_MODE=sandbox
HMRC_MTD_ITSA_MODE=${HMRC_MTD_ITSA_MODE:-sandbox}
# Rate Limits
RATE_LIMITS_HMRC_API_RPS=3
RATE_LIMITS_HMRC_API_BURST=6
RATE_LIMITS_LLM_API_RPS=10
RATE_LIMITS_LLM_API_BURST=20
RATE_LIMITS_HMRC_API_RPS=${RATE_LIMITS_HMRC_API_RPS:-3}
RATE_LIMITS_HMRC_API_BURST=${RATE_LIMITS_HMRC_API_BURST:-6}
RATE_LIMITS_LLM_API_RPS=${RATE_LIMITS_LLM_API_RPS:-10}
RATE_LIMITS_LLM_API_BURST=${RATE_LIMITS_LLM_API_BURST:-20}
# Confidence Thresholds
CONFIDENCE_AUTO_SUBMIT=0.95
CONFIDENCE_HUMAN_REVIEW=0.85
CONFIDENCE_REJECT=0.50
CONFIDENCE_AUTO_SUBMIT=${CONFIDENCE_AUTO_SUBMIT:-0.95}
CONFIDENCE_HUMAN_REVIEW=${CONFIDENCE_HUMAN_REVIEW:-0.85}
CONFIDENCE_REJECT=${CONFIDENCE_REJECT:-0.50}
# Logging
LOG_LEVEL=INFO
LOG_FORMAT=json
LOG_LEVEL=${LOG_LEVEL:-INFO}
LOG_FORMAT=${LOG_FORMAT:-json}
# Development Settings
DEBUG=false
DEVELOPMENT_MODE=true
DEBUG=${DEBUG:-false}
DEVELOPMENT_MODE=${DEVELOPMENT_MODE:-true}
# Security
ENCRYPTION_KEY_ID=default
AUDIT_LOG_RETENTION_DAYS=90
PII_LOG_RETENTION_DAYS=30
ENCRYPTION_KEY_ID=${ENCRYPTION_KEY_ID:-default}
AUDIT_LOG_RETENTION_DAYS=${AUDIT_LOG_RETENTION_DAYS:-90}
PII_LOG_RETENTION_DAYS=${PII_LOG_RETENTION_DAYS:-30}
# Backup & DR
BACKUP_ENABLED=true
BACKUP_SCHEDULE=0 2 * * *
BACKUP_RETENTION_DAYS=30
BACKUP_ENABLED=${BACKUP_ENABLED:-true}
BACKUP_SCHEDULE="${BACKUP_SCHEDULE:-0 2 * * *}"
BACKUP_RETENTION_DAYS=${BACKUP_RETENTION_DAYS:-30}
# Performance Tuning
MAX_WORKERS=4
BATCH_SIZE=100
CACHE_TTL_SECONDS=3600
CONNECTION_POOL_SIZE=20
MAX_WORKERS=${MAX_WORKERS:-4}
BATCH_SIZE=${BATCH_SIZE:-100}
CACHE_TTL_SECONDS=${CACHE_TTL_SECONDS:-3600}
CONNECTION_POOL_SIZE=${CONNECTION_POOL_SIZE:-20}
# Registry / build
REGISTRY=${REGISTRY:-localhost:5000}
REGISTRY_USER=${REGISTRY_USER:-admin}
REGISTRY_PASSWORD=${REGISTRY_PASSWORD:-admin123}
IMAGE_TAG=${IMAGE_TAG:-latest}
OWNER=${OWNER:-local}
# Feature Flags
FEATURE_RAG_ENABLED=true
FEATURE_FIRM_CONNECTORS_ENABLED=false
FEATURE_HMRC_SUBMISSION_ENABLED=false
FEATURE_ADVANCED_CALCULATIONS_ENABLED=true
FEATURE_RAG_ENABLED=${FEATURE_RAG_ENABLED:-true}
FEATURE_FIRM_CONNECTORS_ENABLED=${FEATURE_FIRM_CONNECTORS_ENABLED:-false}
FEATURE_HMRC_SUBMISSION_ENABLED=${FEATURE_HMRC_SUBMISSION_ENABLED:-false}
FEATURE_ADVANCED_CALCULATIONS_ENABLED=${FEATURE_ADVANCED_CALCULATIONS_ENABLED:-true}
# API Keys (placeholders for local testing)
OPENAI_API_KEY=${OPENAI_API_KEY:-sk-local-placeholder}
ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-sk-ant-local-placeholder}
EOF
# Set secure permissions
chmod 600 "$ENV_FILE"
mv "$tmp" "$file"
chmod 600 "$file"
echo -e "${GREEN}✅ Wrote secrets to $file${NC}"
}
echo -e "${BLUE}🔐 Generating secure secrets for AI Tax Agent...${NC}"
echo
# Generate secrets (random where appropriate)
AUTHENTIK_SECRET_KEY=$(generate_secret 50)
AUTHENTIK_OUTPOST_TOKEN=$(generate_secret 64)
AUTHENTIK_API_CLIENT_SECRET=$(generate_secret 32)
AUTHENTIK_UI_REVIEW_CLIENT_SECRET=$(generate_secret 32)
AUTHENTIK_GRAFANA_CLIENT_SECRET=$(generate_secret 32)
AUTHENTIK_MINIO_CLIENT_SECRET=$(generate_secret 32)
AUTHENTIK_VAULT_CLIENT_SECRET=$(generate_secret 32)
GRAFANA_OAUTH_CLIENT_SECRET=$(generate_secret 32)
NEXTAUTH_SECRET=$(generate_secret 48)
JWT_SECRET=$(generate_secret 48)
ENCRYPTION_KEY=$(generate_secret 32)
VAULT_DEV_ROOT_TOKEN_ID=$(generate_uuid)
POSTGRES_PASSWORD=$(generate_secret 16)
NEO4J_PASSWORD=$(generate_secret 16)
AUTHENTIK_DB_PASSWORD=$(generate_secret 16)
MINIO_ROOT_PASSWORD=$(generate_secret 16)
MINIO_ACCESS_KEY=$(generate_secret 16)
MINIO_SECRET_KEY=$(generate_secret 24)
GRAFANA_PASSWORD=$(generate_secret 16)
UNLEASH_ADMIN_TOKEN="admin:$(generate_secret 24)"
REDIS_PASSWORD=$(generate_secret 16)
# Defaults for commonly overridden values
DOMAIN=${DOMAIN:-local.lan}
EMAIL=${EMAIL:-admin@${DOMAIN}}
ACME_EMAIL=${ACME_EMAIL:-$EMAIL}
# Write env file
write_env "infra/environments/local/.env"
echo -e "${GREEN}✅ Secrets generated successfully!${NC}"
echo
echo -e "${YELLOW}📝 Important credentials:${NC}"
echo -e " ${BLUE}Grafana Admin:${NC} admin / $GRAFANA_PASSWORD"
echo -e " ${BLUE}Authentik Admin:${NC} admin@local (set password on first login)"
echo -e " ${BLUE}MinIO Admin:${NC} ${MINIO_ROOT_USER:-minio} / $MINIO_ROOT_PASSWORD"
echo -e " ${BLUE}Vault Root Token:${NC} $VAULT_DEV_ROOT_TOKEN_ID"
echo -e " ${BLUE}MinIO Admin:${NC} minio / $MINIO_ROOT_PASSWORD"
echo -e " ${BLUE}Authentik Bootstrap:${NC} ${AUTHENTIK_BOOTSTRAP_EMAIL:-admin@${DOMAIN}} / ${AUTHENTIK_BOOTSTRAP_PASSWORD:-admin123}"
echo
echo -e "${RED}⚠️ SECURITY WARNING:${NC}"
echo -e " • Keep the .env file secure and never commit it to version control"
echo -e " • Change default passwords on first login"
echo -e " • Use proper secrets management in production"
echo -e " • Regularly rotate secrets"
echo
echo -e "${GREEN}🚀 Ready to deploy with: make deploy-infra${NC}"
echo -e " • Keep the generated env files secure and out of version control"
echo -e " • Rotate secrets regularly for non-local environments"

View File

@@ -11,12 +11,17 @@ BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration
# Load environment variables
if [ -f "infra/compose/.env" ]; then
source "infra/compose/.env"
fi
DOMAIN=${DOMAIN:-local}
AUTHENTIK_URL="https://auth.${DOMAIN}"
AUTHENTIK_API_URL="$AUTHENTIK_URL/api/v3"
ADMIN_EMAIL="admin@local"
ADMIN_EMAIL="admin@${DOMAIN}"
ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"
BOOTSTRAP_FILE="infra/compose/authentik/bootstrap.yaml"
BOOTSTRAP_FILE="infra/authentik/bootstrap.yaml"
echo -e "${BLUE}🔧 Setting up Authentik SSO for AI Tax Agent using Blueprint Import...${NC}"
echo
@@ -76,17 +81,17 @@ generate_secrets() {
# Function to get API token
get_api_token() {
echo -e "${YELLOW}🔑 Getting API token...${NC}"
echo -e "${YELLOW}🔑 Getting API token...${NC}" >&2
# Use bootstrap token if available
if [ -n "${AUTHENTIK_BOOTSTRAP_TOKEN:-}" ]; then
# Use bootstrap token if available and valid
if [ -n "${AUTHENTIK_BOOTSTRAP_TOKEN:-}" ] && [ "$AUTHENTIK_BOOTSTRAP_TOKEN" != "ak-bootstrap-token" ]; then
echo "$AUTHENTIK_BOOTSTRAP_TOKEN"
return 0
fi
# Try to get token via API (requires manual setup first)
local token_response
token_response=$(curl -s -X POST "$AUTHENTIK_API_URL/core/tokens/" \
token_response=$(curl -ks -X POST "$AUTHENTIK_API_URL/core/tokens/" \
-H "Content-Type: application/json" \
-u "$ADMIN_EMAIL:$ADMIN_PASSWORD" \
-d '{
@@ -115,12 +120,12 @@ import_blueprint() {
# Create blueprint instance
local blueprint_response
blueprint_response=$(curl -s -X POST "$AUTHENTIK_API_URL/managed/blueprints/" \
blueprint_response=$(curl -k -X POST "$AUTHENTIK_API_URL/managed/blueprints/" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $token" \
-d '{
"name": "AI Tax Agent Bootstrap",
"path": "/blueprints/bootstrap.yaml",
"path": "ai-tax-agent-bootstrap.yaml",
"context": {},
"enabled": true
}' 2>/dev/null || echo "")
@@ -128,22 +133,60 @@ import_blueprint() {
local blueprint_pk
blueprint_pk=$(echo "$blueprint_response" | python3 -c "import sys, json; print(json.load(sys.stdin).get('pk', ''))" 2>/dev/null || echo "")
if [ -z "$blueprint_pk" ]; then
echo -e "${YELLOW}⚠️ Could not create blueprint. It might already exist. Trying to find it...${NC}"
local existing_bp
existing_bp=$(curl -k -X GET "$AUTHENTIK_API_URL/managed/blueprints/?name=AI%20Tax%20Agent%20Bootstrap" \
-H "Authorization: Bearer $token" 2>/dev/null || echo "")
blueprint_pk=$(echo "$existing_bp" | python3 -c "import sys, json; print(json.load(sys.stdin)['results'][0]['pk'])" 2>/dev/null || echo "")
fi
if [ -n "$blueprint_pk" ]; then
echo -e "${GREEN}✅ Blueprint created with ID: $blueprint_pk${NC}"
# Apply the blueprint
echo -e "${YELLOW}🔄 Applying blueprint...${NC}"
local apply_response
apply_response=$(curl -s -X POST "$AUTHENTIK_API_URL/managed/blueprints/$blueprint_pk/apply/" \
apply_response=$(curl -k -X POST "$AUTHENTIK_API_URL/managed/blueprints/$blueprint_pk/apply/" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $token" \
-d '{}' 2>/dev/null || echo "")
if echo "$apply_response" | grep -q "success\|applied" 2>/dev/null; then
echo -e "${GREEN}✅ Blueprint applied successfully${NC}"
echo -e "${GREEN}✅ Blueprint applied successfully${NC}"
# Force-sync the Outpost token
# The blueprint might fail to update the token for the existing embedded outpost, so we do it explicitly.
echo -e "${YELLOW}🔄 Syncing Outpost token...${NC}"
if docker exec -i apa-authentik-server python3 /manage.py shell -c "
from authentik.outposts.models import Outpost
from authentik.core.models import Token
import os
try:
token_key = os.environ.get('AUTHENTIK_OUTPOST_TOKEN')
if token_key:
o = Outpost.objects.get(name='authentik Embedded Outpost')
t = Token.objects.get(pk=o.token.pk)
if t.key != token_key:
t.key = token_key
t.save()
print('Token updated')
else:
print('Token already matches')
else:
print('No AUTHENTIK_OUTPOST_TOKEN found in environment')
except Exception as e:
print(f'Error updating token: {e}')
exit(1)
" > /dev/null; then
echo -e "${GREEN}✅ Outpost token synced${NC}"
# Restart outpost to pick up changes if needed (though it reads from env, so mostly for connection retry)
docker restart apa-authentik-outpost > /dev/null 2>&1 || true
else
echo -e "${YELLOW}⚠️ Blueprint application may have had issues. Check Authentik logs.${NC}"
echo -e "${RED}❌ Failed to sync Outpost token${NC}"
fi
else
echo -e "${RED}❌ Failed to create blueprint${NC}"
return 1
@@ -186,23 +229,25 @@ main() {
exit 1
fi
# Check if initial setup is needed
local host
host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
local resolve=(--resolve "${host}:443:127.0.0.1")
local setup_code
setup_code=$(curl -ks "${resolve[@]}" -o /dev/null -w '%{http_code}' "$AUTHENTIK_URL/if/flow/initial-setup/" || true)
# Check if initial setup is needed (only if we don't have a token)
if [ -z "${AUTHENTIK_BOOTSTRAP_TOKEN:-}" ] || [ "$AUTHENTIK_BOOTSTRAP_TOKEN" == "ak-bootstrap-token" ]; then
local host
host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
local resolve=(--resolve "${host}:443:127.0.0.1")
local setup_code
setup_code=$(curl -ks "${resolve[@]}" -o /dev/null -w '%{http_code}' "$AUTHENTIK_URL/if/flow/initial-setup/" || true)
if [[ "$setup_code" == "200" ]]; then
echo -e "${YELLOW}📋 Initial Authentik setup required:${NC}"
echo -e " 1. Open ${BLUE}https://auth.local/if/flow/initial-setup/${NC}"
echo -e " 2. Complete the setup wizard with admin user"
echo -e " 3. Re-run this script after setup is complete"
echo
echo -e "${BLUE}💡 Tip: Use these credentials:${NC}"
echo -e " • Email: ${BLUE}$ADMIN_EMAIL${NC}"
echo -e " • Password: ${BLUE}$ADMIN_PASSWORD${NC}"
return 0
if [[ "$setup_code" == "200" ]]; then
echo -e "${YELLOW}📋 Initial Authentik setup required:${NC}"
echo -e " 1. Open ${BLUE}https://auth.${DOMAIN}/if/flow/initial-setup/${NC}"
echo -e " 2. Complete the setup wizard with admin user"
echo -e " 3. Re-run this script after setup is complete"
echo
echo -e "${BLUE}💡 Tip: Use these credentials:${NC}"
echo -e " • Email: ${BLUE}$ADMIN_EMAIL${NC}"
echo -e " • Password: ${BLUE}$ADMIN_PASSWORD${NC}"
return 0
fi
fi
# Try to get API token
@@ -231,7 +276,7 @@ main() {
fi
else
echo -e "${YELLOW}📋 Could not obtain API token. Manual configuration required:${NC}"
echo -e " 1. Open ${BLUE}https://auth.local${NC} and log in as admin"
echo -e " 1. Open ${BLUE}https://auth.local.lan${NC} and log in as admin"
echo -e " 2. Go to Admin Interface > Tokens"
echo -e " 3. Create a new token and set AUTHENTIK_BOOTSTRAP_TOKEN in .env"
echo -e " 4. Re-run this script"
@@ -239,10 +284,10 @@ main() {
echo
echo -e "${BLUE}🔗 Access URLs:${NC}"
echo -e " • Authentik Admin: ${BLUE}https://auth.local${NC}"
echo -e " • API Gateway: ${BLUE}https://api.local${NC}"
echo -e " • Grafana: ${BLUE}https://grafana.local${NC}"
echo -e " • Review Portal: ${BLUE}https://review.local${NC}"
echo -e " • Authentik Admin: ${BLUE}https://auth.local.lan${NC}"
echo -e " • API Gateway: ${BLUE}https://api.local.lan${NC}"
echo -e " • Grafana: ${BLUE}https://grafana.local.lan${NC}"
echo -e " • Review Portal: ${BLUE}https://review.local.lan${NC}"
}
# Run main function

scripts/setup-vault.sh Executable file
View File

@@ -0,0 +1,106 @@
#!/bin/bash
# Setup Vault OIDC Authentication
set -euo pipefail
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Load environment variables
if [ -f "infra/compose/.env" ]; then
source "infra/compose/.env"
fi
DOMAIN=${DOMAIN:-local.lan}
VAULT_ADDR="http://localhost:8200"
AUTHENTIK_URL="https://auth.${DOMAIN}"
echo -e "${BLUE}🔧 Setting up Vault OIDC Authentication...${NC}"
# Function to check if Vault is ready
wait_for_vault() {
echo -e "${YELLOW}⏳ Waiting for Vault to be ready...${NC}"
local max_attempts=30
local attempt=1
while [ $attempt -le $max_attempts ]; do
if docker exec -e VAULT_ADDR=http://127.0.0.1:8200 apa-vault vault status > /dev/null 2>&1; then
echo -e "${GREEN}✅ Vault is ready!${NC}"
return 0
fi
echo -n "."
sleep 2
attempt=$((attempt + 1))
done
echo -e "${RED}❌ Vault failed to start${NC}"
return 1
}
# Main setup function
setup_vault() {
# Check if we have the root token
if [ -z "${VAULT_DEV_ROOT_TOKEN_ID:-}" ]; then
echo -e "${RED}❌ VAULT_DEV_ROOT_TOKEN_ID not found in environment${NC}"
return 1
fi
# Check if we have the client secret
if [ -z "${AUTHENTIK_VAULT_CLIENT_SECRET:-}" ]; then
echo -e "${RED}❌ AUTHENTIK_VAULT_CLIENT_SECRET not found in environment${NC}"
return 1
fi
# Execute commands inside the Vault container
echo -e "${YELLOW}🔐 Configuring Vault OIDC...${NC}"
# Login
docker exec -e VAULT_ADDR=http://127.0.0.1:8200 apa-vault vault login "$VAULT_DEV_ROOT_TOKEN_ID" > /dev/null
# Enable OIDC auth method (ignore error if already enabled)
docker exec -e VAULT_ADDR=http://127.0.0.1:8200 apa-vault vault auth enable oidc 2>/dev/null || true
echo -e "${GREEN}✅ OIDC auth enabled${NC}"
# Configure OIDC
# Note: We use the internal Docker network URL for discovery if possible, or the public one if Vault can resolve it.
# Since Vault is in the backend network, it can reach 'apa-authentik-server'.
# However, the discovery URL usually needs to match what the user sees (issuer validation).
# Authentik's issuer is usually the slug URL.
# Using the public URL for discovery URL as per standard OIDC validation
# We might need to ensure Vault container can resolve auth.local.lan to the Traefik IP or Authentik IP.
# In our setup, auth.local.lan resolves to 127.0.0.1 on host. Inside container, it needs to resolve to the gateway or authentik.
# For now, let's try using the public URL. If it fails, we might need to add a host alias to the Vault container.
docker exec -e VAULT_ADDR=http://127.0.0.1:8200 apa-vault vault write auth/oidc/config \
oidc_discovery_url="$AUTHENTIK_URL/application/o/vault-oidc/" \
oidc_client_id="vault" \
oidc_client_secret="$AUTHENTIK_VAULT_CLIENT_SECRET" \
default_role="reader" \
bound_issuer="localhost" \
oidc_discovery_ca_pem=@/certs/local.crt
echo -e "${GREEN}✅ OIDC config written${NC}"
# Create reader role
docker exec -e VAULT_ADDR=http://127.0.0.1:8200 apa-vault vault write auth/oidc/role/reader \
bound_audiences="vault" \
allowed_redirect_uris="https://vault.${DOMAIN}/ui/vault/auth/oidc/oidc/callback,https://vault.${DOMAIN}/oidc/callback,http://localhost:8250/oidc/callback" \
oidc_scopes="openid,email,profile" \
user_claim="email" \
policies="default" \
ttl="1h"
echo -e "${GREEN}✅ OIDC role 'reader' created${NC}"
echo
echo -e "${GREEN}🎉 Vault OIDC setup complete!${NC}"
echo -e " Login at: ${BLUE}https://vault.${DOMAIN}/ui/vault/auth/oidc/oidc/callback${NC}"
}
# Run
wait_for_vault
setup_vault
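Given the discovery-URL caveats noted in the comments above, it can save time to confirm that Authentik's OIDC discovery document is reachable and advertises the expected issuer before Vault tries to consume it. A sketch, reusing the URL and slug assumed above, with certificate verification disabled for the self-signed dev cert:

```python
# Sketch: verify the OIDC discovery document that Vault will consume.
import requests


def check_oidc_discovery(base: str = "https://auth.local.lan/application/o/vault-oidc/") -> None:
    well_known = base.rstrip("/") + "/.well-known/openid-configuration"
    resp = requests.get(well_known, verify=False, timeout=10)  # dev cert is self-signed
    resp.raise_for_status()
    doc = resp.json()
    issuer = doc.get("issuer", "")
    # Vault validates that token 'iss' claims match the configured discovery issuer,
    # so a mismatch here is the usual cause of "issuer did not match" errors.
    print("issuer:", issuer)
    print("authorization_endpoint:", doc.get("authorization_endpoint"))
    assert issuer, "discovery document did not advertise an issuer"


if __name__ == "__main__":
    check_oidc_discovery()
```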

View File

@@ -0,0 +1,76 @@
import asyncio
import httpx
import pytest
from libs.events import EventTopics, NATSEventBus
from libs.schemas.events import DocumentExtractedEventData
# Configuration
INGESTION_URL = "http://localhost:8000"
NATS_URL = "nats://localhost:4222"
TENANT_ID = "tenant_e2e_test"
@pytest.mark.e2e
@pytest.mark.asyncio
async def test_backend_journey():
"""
E2E test for the full backend journey: Ingest -> OCR -> Extract.
"""
# 1. Initialize NATS bus
bus = NATSEventBus(
servers=[NATS_URL],
stream_name="TAX_AGENT_EVENTS",
consumer_group="e2e-test-consumer",
)
await bus.start()
# Future to capture the final event
extraction_future = asyncio.Future()
async def extraction_handler(topic, payload):
if payload.tenant_id == TENANT_ID:
extraction_future.set_result(payload)
# Subscribe to the final event in the chain
await bus.subscribe(EventTopics.DOC_EXTRACTED, extraction_handler)
try:
# 2. Upload a document
async with httpx.AsyncClient() as client:
# Create a dummy PDF file
files = {"file": ("test.pdf", b"%PDF-1.4 mock content", "application/pdf")}
response = await client.post(
f"{INGESTION_URL}/upload",
files=files,
data={"kind": "invoice", "source": "e2e_test"},
headers={"X-Tenant-ID": TENANT_ID, "X-User-ID": "e2e_tester"},
)
assert response.status_code == 200, f"Upload failed: {response.text}"
upload_data = response.json()
doc_id = upload_data["doc_id"]
print(f"Uploaded document: {doc_id}")
# 3. Wait for extraction event (with timeout)
try:
# Give it enough time for the whole chain to process
payload = await asyncio.wait_for(extraction_future, timeout=30.0)
# 4. Verify payload
data = payload.data
assert data["doc_id"] == doc_id
assert data["tenant_id"] == TENANT_ID
assert "extraction_results" in data
# Validate against schema
event_data = DocumentExtractedEventData(**data)
assert event_data.doc_id == doc_id
print("E2E Journey completed successfully!")
except TimeoutError:
pytest.fail("Timed out waiting for extraction event")
finally:
await bus.stop()

View File

@@ -0,0 +1,39 @@
import pytest
from libs.events import EventTopics
from libs.schemas.events import DocumentIngestedEventData, validate_event_data
@pytest.mark.integration
def test_doc_ingested_contract():
"""
Contract test for DOC_INGESTED event.
Verifies that the event data schema matches the expected Pydantic model.
"""
# Sample valid payload data
valid_data = {
"doc_id": "doc_01H1V2W3X4Y5Z6",
"filename": "test.pdf",
"kind": "invoice",
"source": "upload",
"checksum_sha256": "a" * 64,
"size_bytes": 1024,
"mime_type": "application/pdf",
"storage_path": "s3://bucket/key.pdf",
}
# 1. Verify it validates against the Pydantic model directly
model = DocumentIngestedEventData(**valid_data)
assert model.doc_id == valid_data["doc_id"]
# 2. Verify it validates using the shared validation utility
validated_model = validate_event_data(EventTopics.DOC_INGESTED, valid_data)
assert isinstance(validated_model, DocumentIngestedEventData)
assert validated_model.doc_id == valid_data["doc_id"]
# 3. Verify invalid data fails
invalid_data = valid_data.copy()
del invalid_data["doc_id"]
with pytest.raises(ValueError):
validate_event_data(EventTopics.DOC_INGESTED, invalid_data)
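For orientation, one plausible shape of the `validate_event_data` helper this contract test exercises, purely illustrative of the topic-to-model registry idea (the real implementation in `libs.schemas.events` may differ):

```python
# Illustrative sketch only: a topic -> Pydantic model registry for event payloads.
from pydantic import BaseModel, ValidationError

from libs.events import EventTopics
from libs.schemas.events import DocumentIngestedEventData

EVENT_MODELS = {
    EventTopics.DOC_INGESTED: DocumentIngestedEventData,
}


def validate_event_data(topic, data: dict) -> BaseModel:
    model_cls = EVENT_MODELS.get(topic)
    if model_cls is None:
        raise ValueError(f"no schema registered for topic {topic!r}")
    try:
        return model_cls(**data)
    except ValidationError as exc:  # pydantic's ValidationError subclasses ValueError
        raise ValueError(str(exc)) from exc
```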

View File

@@ -0,0 +1,98 @@
import asyncio
import nats
import pytest
from libs.events.base import EventPayload
from libs.events.nats_bus import NATSEventBus
from libs.schemas.events import DocumentIngestedEventData
@pytest.mark.asyncio
async def test_nats_bus_class():
"""Test NATSEventBus class within pytest."""
import time
unique_suffix = int(time.time())
stream_name = f"PYTEST_DEBUG_STREAM_{unique_suffix}"
print(f"\nStarting NATSEventBus with stream {stream_name}...")
bus = NATSEventBus(
servers="nats://localhost:4222",
stream_name=stream_name,
consumer_group="test-debug-group",
)
await bus.start()
print("Bus started.")
# Clean up (just in case)
try:
await bus.js.delete_stream(stream_name)
except Exception:
pass
await bus._ensure_stream_exists()
# Wait for stream to be ready
await asyncio.sleep(2)
try:
info = await bus.js.stream_info(stream_name)
print(f"Stream info: {info.config.subjects}")
except Exception as e:
print(f"Failed to get stream info: {e}")
# Setup subscriber
received_event = asyncio.Future()
async def handler(topic, event):
print(f"Handler received event: {event.event_id}")
if not received_event.done():
received_event.set_result(event)
await bus.subscribe("doc.ingested", handler)
print("Publishing message...")
data = DocumentIngestedEventData(
doc_id="test-doc-123",
filename="test.pdf",
mime_type="application/pdf",
size_bytes=1024,
source="upload",
kind="invoice",
storage_path="s3://test-bucket/test.pdf",
checksum_sha256="a" * 64,
)
payload = EventPayload(
data=data.model_dump(mode="json"),
actor="tester",
tenant_id="tenant-1",
schema_version="1.0",
)
payload.event_id = "evt-debug-1"
success = await bus.publish("doc.ingested", payload)
print(f"Published: {success}")
try:
result = await asyncio.wait_for(received_event, timeout=5.0)
print(f"Received event: {result.event_id}")
assert result.event_id == "evt-debug-1"
assert result.data["doc_id"] == "test-doc-123"
except TimeoutError:
print("Timeout waiting for event")
raise
await bus.stop()
print("Bus stopped.")
# Cleanup stream
try:
nc = await nats.connect("nats://localhost:4222")
js = nc.jetstream()
await js.delete_stream(stream_name)
await nc.close()
except Exception:
pass

View File

@@ -0,0 +1,240 @@
import asyncio
import json
import pytest
import pytest_asyncio
from libs.events.base import EventPayload
from libs.events.nats_bus import NATSEventBus
from libs.schemas.events import DocumentIngestedEventData
# Check if NATS is available
async def is_nats_available():
import nats
try:
nc = await nats.connect("nats://localhost:4222")
await nc.close()
return True
except Exception:
return False
@pytest_asyncio.fixture
async def nats_bus():
"""Create and start a NATS event bus for testing."""
if not await is_nats_available():
pytest.skip("NATS server not available at localhost:4222")
bus = NATSEventBus(
servers="nats://localhost:4222",
stream_name="TEST_INTEGRATION_STREAM",
consumer_group="test-integration-group",
dlq_stream_name="TEST_INTEGRATION_DLQ",
max_retries=2,
)
await bus.start()
# Clean up streams before test
try:
await bus.js.delete_stream("TEST_INTEGRATION_STREAM")
await bus.js.delete_stream("TEST_INTEGRATION_DLQ")
except Exception:
pass
# Re-create streams
await bus._ensure_stream_exists()
await bus.dlq.ensure_dlq_stream_exists()
# Allow time for streams to propagate
await asyncio.sleep(2)
yield bus
# Clean up after test
try:
await bus.js.delete_stream("TEST_INTEGRATION_STREAM")
await bus.js.delete_stream("TEST_INTEGRATION_DLQ")
except Exception:
pass
await bus.stop()
@pytest.mark.integration
@pytest.mark.asyncio
async def test_publish_subscribe_flow():
"""Test end-to-end publish and subscribe flow."""
# Instantiate bus directly to debug fixture issues
bus = NATSEventBus(
servers="nats://localhost:4222",
stream_name="TEST_INTEGRATION_STREAM_DIRECT",
consumer_group="test-integration-group-direct",
dlq_stream_name="TEST_INTEGRATION_DLQ_DIRECT",
max_retries=2,
)
await bus.start()
try:
await bus.js.delete_stream("TEST_INTEGRATION_STREAM_DIRECT")
except Exception:
pass
await bus._ensure_stream_exists()
try:
# Create event data
data = DocumentIngestedEventData(
doc_id="test-doc-123",
filename="test.pdf",
mime_type="application/pdf",
size_bytes=1024,
source="upload",
kind="invoice",
storage_path="s3://test-bucket/test.pdf",
checksum_sha256="a" * 64,
)
payload = EventPayload(
data=data.model_dump(mode="json"),
actor="test-user",
tenant_id="test-tenant",
trace_id="trace-123",
schema_version="1.0",
)
payload.event_id = "evt-123"
# Setup subscriber
received_event = asyncio.Future()
async def handler(topic, event):
if not received_event.done():
received_event.set_result(event)
await bus.subscribe("doc.ingested", handler)
# Publish event
success = await bus.publish("doc.ingested", payload)
assert success is True
# Wait for reception
try:
result = await asyncio.wait_for(received_event, timeout=5.0)
assert result.event_id == payload.event_id
assert result.data["doc_id"] == "test-doc-123"
except TimeoutError:
pytest.fail("Event not received within timeout")
finally:
await bus.stop()
@pytest.mark.integration
@pytest.mark.asyncio
async def test_dlq_routing(nats_bus):
"""Test that failed events are routed to DLQ after retries."""
# Create event data
data = DocumentIngestedEventData(
doc_id="test-doc-fail",
filename="fail.pdf",
mime_type="application/pdf",
size_bytes=1024,
source="upload",
kind="invoice",
storage_path="s3://test-bucket/fail.pdf",
checksum_sha256="a" * 64,
)
payload = EventPayload(
data=data.model_dump(mode="json"),
actor="test-user",
tenant_id="test-tenant",
trace_id="trace-fail",
schema_version="1.0",
)
# Setup failing handler
failure_count = 0
async def failing_handler(topic, event):
nonlocal failure_count
failure_count += 1
raise ValueError("Simulated processing failure")
await nats_bus.subscribe("doc.fail", failing_handler)
# Publish event
await nats_bus.publish("doc.fail", payload)
# Wait for retries and DLQ routing
await asyncio.sleep(2.0) # Wait for processing
assert failure_count >= 2
# Consume from DLQ to verify
dlq_sub = await nats_bus.js.pull_subscribe(
subject="TEST_INTEGRATION_DLQ.doc.fail", durable="test-dlq-consumer"
)
msgs = await dlq_sub.fetch(batch=1, timeout=5.0)
assert len(msgs) == 1
dlq_msg = msgs[0]
dlq_data = json.loads(dlq_msg.data.decode())
assert dlq_data["original_payload"]["event_id"] == payload.event_id
assert dlq_data["error"]["type"] == "ValueError"
assert dlq_data["error"]["message"] == "Simulated processing failure"
await dlq_msg.ack()
@pytest.mark.integration
@pytest.mark.asyncio
async def test_metrics_recording(nats_bus):
"""Test that metrics are recorded during event processing."""
from libs.events.metrics import event_consumed_total, event_published_total
# Get initial values
initial_published = event_published_total.labels(topic="doc.metrics")._value.get()
initial_consumed = event_consumed_total.labels(
topic="doc.metrics", consumer_group="test-integration-group"
)._value.get()
# Create and publish event
data = DocumentIngestedEventData(
doc_id="test-doc-metrics",
filename="metrics.pdf",
mime_type="application/pdf",
size_bytes=1024,
source="upload",
kind="invoice",
storage_path="s3://test-bucket/metrics.pdf",
checksum_sha256="a" * 64,
)
payload = EventPayload(
data=data.model_dump(mode="json"),
actor="test-user",
tenant_id="test-tenant",
trace_id="trace-metrics",
schema_version="1.0",
)
received_event = asyncio.Future()
async def handler(topic, event):
if not received_event.done():
received_event.set_result(event)
await nats_bus.subscribe("doc.metrics", handler)
await nats_bus.publish("doc.metrics", payload)
await asyncio.wait_for(received_event, timeout=5.0)
# Check metrics increased
final_published = event_published_total.labels(topic="doc.metrics")._value.get()
final_consumed = event_consumed_total.labels(
topic="doc.metrics", consumer_group="test-integration-group"
)._value.get()
assert final_published > initial_published
assert final_consumed > initial_consumed
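Note that `._value.get()` reaches into prometheus_client internals; if this assertion ever becomes brittle, the registry's public sample accessor does the same job. A sketch, assuming the metric and label names used above are registered on the registry returned by `get_event_metrics_registry()`:

```python
# Sketch: read counter samples via the public registry API instead of ._value.get().
from libs.events.metrics import get_event_metrics_registry


def published_count(topic: str) -> float:
    registry = get_event_metrics_registry()
    value = registry.get_sample_value("event_published_total", {"topic": topic})
    return value or 0.0


def consumed_count(topic: str, consumer_group: str) -> float:
    registry = get_event_metrics_registry()
    value = registry.get_sample_value(
        "event_consumed_total", {"topic": topic, "consumer_group": consumer_group}
    )
    return value or 0.0
```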

tests/unit/test_dlq.py Normal file
View File

@@ -0,0 +1,317 @@
"""Tests for Dead Letter Queue (DLQ) handler."""
import json
from unittest.mock import AsyncMock, patch
import pytest
from libs.events.base import EventPayload
from libs.events.dlq import DLQHandler, DLQMetrics
@pytest.fixture
def event_payload():
"""Create a test event payload."""
return EventPayload(
data={"test": "data", "value": 123},
actor="test-user",
tenant_id="test-tenant",
trace_id="test-trace-123",
schema_version="1.0",
)
@pytest.fixture
def mock_js():
"""Create a mock JetStream context."""
js = AsyncMock()
js.stream_info = AsyncMock()
js.add_stream = AsyncMock()
js.publish = AsyncMock()
return js
class TestDLQHandler:
"""Test cases for DLQ handler."""
@pytest.mark.asyncio
async def test_initialization(self, mock_js):
"""Test DLQ handler initialization."""
handler = DLQHandler(
js=mock_js,
dlq_stream_name="TEST_DLQ",
max_retries=5,
backoff_base_ms=500,
)
assert handler.js == mock_js
assert handler.dlq_stream_name == "TEST_DLQ"
assert handler.max_retries == 5
assert handler.backoff_base_ms == 500
@pytest.mark.asyncio
async def test_ensure_dlq_stream_exists_already_exists(self, mock_js):
"""Test ensuring DLQ stream when it already exists."""
mock_js.stream_info.return_value = {"name": "TEST_DLQ"}
handler = DLQHandler(js=mock_js, dlq_stream_name="TEST_DLQ")
await handler.ensure_dlq_stream_exists()
mock_js.stream_info.assert_called_once_with("TEST_DLQ")
mock_js.add_stream.assert_not_called()
@pytest.mark.asyncio
async def test_ensure_dlq_stream_creates_stream(self, mock_js):
"""Test ensuring DLQ stream when it doesn't exist."""
from nats.js.errors import NotFoundError
mock_js.stream_info.side_effect = NotFoundError
mock_js.add_stream = AsyncMock()
handler = DLQHandler(js=mock_js, dlq_stream_name="TEST_DLQ")
await handler.ensure_dlq_stream_exists()
mock_js.add_stream.assert_called_once()
call_kwargs = mock_js.add_stream.call_args[1]
assert call_kwargs["name"] == "TEST_DLQ"
assert call_kwargs["subjects"] == ["TEST_DLQ.*"]
@pytest.mark.asyncio
async def test_send_to_dlq(self, mock_js, event_payload):
"""Test sending event to DLQ."""
handler = DLQHandler(js=mock_js)
error = ValueError("Test error message")
await handler.send_to_dlq(
topic="test-topic",
payload=event_payload,
error=error,
retry_count=3,
)
mock_js.publish.assert_called_once()
call_kwargs = mock_js.publish.call_args[1]
# Verify subject
assert call_kwargs["subject"] == "TAX_AGENT_DLQ.test-topic"
# Verify payload content
payload_data = json.loads(call_kwargs["payload"].decode())
assert payload_data["original_topic"] == "test-topic"
assert payload_data["retry_count"] == 3
assert payload_data["error"]["type"] == "ValueError"
assert payload_data["error"]["message"] == "Test error message"
# Verify headers
headers = call_kwargs["headers"]
assert headers["original_topic"] == "test-topic"
assert headers["event_id"] == event_payload.event_id
assert headers["error_type"] == "ValueError"
@pytest.mark.asyncio
async def test_send_to_dlq_with_original_message(self, mock_js, event_payload):
"""Test sending event to DLQ with original message data."""
handler = DLQHandler(js=mock_js)
original_message = b'{"test": "original"}'
error = RuntimeError("Processing failed")
await handler.send_to_dlq(
topic="test-topic",
payload=event_payload,
error=error,
retry_count=2,
original_message_data=original_message,
)
call_kwargs = mock_js.publish.call_args[1]
payload_data = json.loads(call_kwargs["payload"].decode())
assert "original_message_data" in payload_data
assert payload_data["original_message_data"] == '{"test": "original"}'
@pytest.mark.asyncio
async def test_send_to_dlq_handles_publish_failure(self, mock_js, event_payload):
"""Test DLQ handler when DLQ publish fails."""
mock_js.publish.side_effect = Exception("DLQ publish failed")
handler = DLQHandler(js=mock_js)
# Should not raise, but log critical error
await handler.send_to_dlq(
topic="test-topic",
payload=event_payload,
error=ValueError("Original error"),
retry_count=1,
)
# Verify publish was attempted
mock_js.publish.assert_called_once()
def test_calculate_backoff(self, mock_js):
"""Test exponential backoff calculation."""
handler = DLQHandler(
js=mock_js,
backoff_base_ms=1000,
backoff_multiplier=2.0,
backoff_max_ms=10000,
)
# First retry: 1000ms * 2^0 = 1000ms = 1s
assert handler.calculate_backoff(0) == 1.0
# Second retry: 1000ms * 2^1 = 2000ms = 2s
assert handler.calculate_backoff(1) == 2.0
# Third retry: 1000ms * 2^2 = 4000ms = 4s
assert handler.calculate_backoff(2) == 4.0
# Fourth retry: 1000ms * 2^3 = 8000ms = 8s
assert handler.calculate_backoff(3) == 8.0
# Fifth retry: would be 16000ms but capped at 10000ms = 10s
assert handler.calculate_backoff(4) == 10.0
@pytest.mark.asyncio
async def test_retry_with_backoff_success_first_attempt(self, mock_js):
"""Test successful operation on first attempt."""
handler = DLQHandler(js=mock_js, max_retries=3)
async def successful_func():
return "success"
success, error = await handler.retry_with_backoff(successful_func)
assert success is True
assert error is None
@pytest.mark.asyncio
async def test_retry_with_backoff_success_after_retries(self, mock_js):
"""Test successful operation after retries."""
handler = DLQHandler(
js=mock_js,
max_retries=3,
backoff_base_ms=100, # Short backoff for testing
)
attempt_count = 0
async def flaky_func():
nonlocal attempt_count
attempt_count += 1
if attempt_count < 3:
raise ValueError(f"Fail attempt {attempt_count}")
return "success"
with patch("asyncio.sleep", new=AsyncMock()): # Speed up test
success, error = await handler.retry_with_backoff(flaky_func)
assert success is True
assert error is None
assert attempt_count == 3
@pytest.mark.asyncio
async def test_retry_with_backoff_all_attempts_fail(self, mock_js):
"""Test operation that fails all retry attempts."""
handler = DLQHandler(
js=mock_js,
max_retries=2,
backoff_base_ms=100,
)
async def always_fails():
raise ValueError("Always fails")
with patch("asyncio.sleep", new=AsyncMock()): # Speed up test
success, error = await handler.retry_with_backoff(always_fails)
assert success is False
assert isinstance(error, ValueError)
assert str(error) == "Always fails"
@pytest.mark.asyncio
async def test_retry_with_backoff_applies_delay(self, mock_js):
"""Test that retry applies backoff delay."""
handler = DLQHandler(
js=mock_js,
max_retries=2,
backoff_base_ms=1000,
backoff_multiplier=2.0,
)
attempt_count = 0
async def failing_func():
nonlocal attempt_count
attempt_count += 1
raise ValueError("Fail")
with patch("asyncio.sleep", new=AsyncMock()) as mock_sleep:
await handler.retry_with_backoff(failing_func)
# Should have called sleep twice (after 1st and 2nd failures)
assert mock_sleep.call_count == 2
# Verify backoff delays
calls = mock_sleep.call_args_list
assert calls[0][0][0] == 1.0 # First retry: 1s
assert calls[1][0][0] == 2.0 # Second retry: 2s
class TestDLQMetrics:
"""Test cases for DLQ metrics."""
def test_initialization(self):
"""Test metrics initialization."""
metrics = DLQMetrics()
assert metrics.total_dlq_events == 0
assert len(metrics.dlq_events_by_topic) == 0
assert len(metrics.dlq_events_by_error_type) == 0
def test_record_dlq_event(self):
"""Test recording DLQ events."""
metrics = DLQMetrics()
metrics.record_dlq_event("topic1", "ValueError")
metrics.record_dlq_event("topic1", "ValueError")
metrics.record_dlq_event("topic2", "RuntimeError")
assert metrics.total_dlq_events == 3
assert metrics.dlq_events_by_topic["topic1"] == 2
assert metrics.dlq_events_by_topic["topic2"] == 1
assert metrics.dlq_events_by_error_type["ValueError"] == 2
assert metrics.dlq_events_by_error_type["RuntimeError"] == 1
def test_get_metrics(self):
"""Test getting metrics snapshot."""
metrics = DLQMetrics()
metrics.record_dlq_event("topic1", "ValueError")
metrics.record_dlq_event("topic1", "RuntimeError")
snapshot = metrics.get_metrics()
assert snapshot["total_dlq_events"] == 2
assert snapshot["by_topic"]["topic1"] == 2
assert snapshot["by_error_type"]["ValueError"] == 1
assert snapshot["by_error_type"]["RuntimeError"] == 1
# Verify it's a copy, not a reference
snapshot["total_dlq_events"] = 999
assert metrics.total_dlq_events == 2
def test_reset(self):
"""Test resetting metrics."""
metrics = DLQMetrics()
metrics.record_dlq_event("topic1", "ValueError")
metrics.record_dlq_event("topic2", "RuntimeError")
assert metrics.total_dlq_events == 2
metrics.reset()
assert metrics.total_dlq_events == 0
assert len(metrics.dlq_events_by_topic) == 0
assert len(metrics.dlq_events_by_error_type) == 0
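The backoff assertions above fully pin down the expected curve; an implementation of `calculate_backoff` consistent with them would look roughly like this (a sketch, not necessarily the code in `libs.events.dlq`):

```python
# Sketch of a calculate_backoff consistent with the unit tests above:
# seconds = min(base_ms * multiplier**retry_count, max_ms) / 1000
def calculate_backoff(
    retry_count: int,
    backoff_base_ms: int = 1000,
    backoff_multiplier: float = 2.0,
    backoff_max_ms: int = 10000,
) -> float:
    delay_ms = backoff_base_ms * (backoff_multiplier ** retry_count)
    return min(delay_ms, backoff_max_ms) / 1000.0


assert calculate_backoff(0) == 1.0
assert calculate_backoff(3) == 8.0
assert calculate_backoff(4) == 10.0  # capped at backoff_max_ms
```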

View File

@@ -0,0 +1,274 @@
"""Tests for event metrics."""
from unittest.mock import MagicMock, patch
from libs.events.metrics import (
EventMetricsCollector,
event_consumed_total,
event_dlq_total,
event_processing_duration_seconds,
event_processing_errors_total,
event_publish_errors_total,
event_published_total,
event_publishing_duration_seconds,
event_retry_total,
event_schema_validation_errors_total,
get_event_metrics_registry,
nats_consumer_lag_messages,
nats_stream_messages_total,
)
class TestEventMetrics:
"""Test cases for event metrics."""
def test_get_event_metrics_registry(self) -> None:
"""Test getting the metrics registry."""
registry = get_event_metrics_registry()
assert registry is not None
def test_metrics_exist(self) -> None:
"""Test that all expected metrics are defined."""
# Publishing metrics
assert event_published_total is not None
assert event_publish_errors_total is not None
assert event_publishing_duration_seconds is not None
# Consumption metrics
assert event_consumed_total is not None
assert event_processing_duration_seconds is not None
assert event_processing_errors_total is not None
# DLQ metrics
assert event_dlq_total is not None
assert event_retry_total is not None
# Schema validation metrics
assert event_schema_validation_errors_total is not None
# NATS metrics
assert nats_stream_messages_total is not None
assert nats_consumer_lag_messages is not None
class TestEventMetricsCollector:
"""Test cases for EventMetricsCollector."""
def test_record_publish_success(self) -> None:
"""Test recording successful publish."""
with patch.object(event_published_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_publish(
topic="test.topic",
duration_seconds=0.05,
success=True,
)
mock_labels.assert_called_once_with(topic="test.topic")
mock_counter.inc.assert_called_once()
def test_record_publish_failure(self) -> None:
"""Test recording failed publish."""
with patch.object(event_publish_errors_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_publish(
topic="test.topic",
duration_seconds=0.1,
success=False,
error_type="ConnectionError",
)
mock_labels.assert_called_once_with(
topic="test.topic", error_type="ConnectionError"
)
mock_counter.inc.assert_called_once()
def test_record_publish_duration(self) -> None:
"""Test recording publish duration."""
with patch.object(event_publishing_duration_seconds, "labels") as mock_labels:
mock_histogram = MagicMock()
mock_labels.return_value = mock_histogram
duration = 0.123
EventMetricsCollector.record_publish(
topic="test.topic",
duration_seconds=duration,
success=True,
)
mock_labels.assert_called_once_with(topic="test.topic")
mock_histogram.observe.assert_called_once_with(duration)
def test_record_consume_success(self) -> None:
"""Test recording successful event consumption."""
with patch.object(event_consumed_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_consume(
topic="test.topic",
consumer_group="test-group",
duration_seconds=0.5,
success=True,
)
mock_labels.assert_called_once_with(
topic="test.topic", consumer_group="test-group"
)
mock_counter.inc.assert_called_once()
def test_record_consume_failure(self) -> None:
"""Test recording failed event consumption."""
with patch.object(event_processing_errors_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_consume(
topic="test.topic",
consumer_group="test-group",
duration_seconds=1.0,
success=False,
error_type="ValidationError",
)
mock_labels.assert_called_once_with(
topic="test.topic",
consumer_group="test-group",
error_type="ValidationError",
)
mock_counter.inc.assert_called_once()
def test_record_consume_duration(self) -> None:
"""Test recording consumption duration."""
with patch.object(event_processing_duration_seconds, "labels") as mock_labels:
mock_histogram = MagicMock()
mock_labels.return_value = mock_histogram
duration = 2.5
EventMetricsCollector.record_consume(
topic="test.topic",
consumer_group="test-group",
duration_seconds=duration,
success=True,
)
mock_labels.assert_called_once_with(
topic="test.topic", consumer_group="test-group"
)
mock_histogram.observe.assert_called_once_with(duration)
def test_record_dlq(self) -> None:
"""Test recording DLQ event."""
with patch.object(event_dlq_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_dlq(
topic="test.topic", error_type="TimeoutError"
)
mock_labels.assert_called_once_with(
topic="test.topic", error_type="TimeoutError"
)
mock_counter.inc.assert_called_once()
def test_record_retry(self) -> None:
"""Test recording retry attempt."""
with patch.object(event_retry_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_retry(topic="test.topic", retry_attempt=2)
mock_labels.assert_called_once_with(topic="test.topic", retry_attempt="2")
mock_counter.inc.assert_called_once()
def test_record_schema_validation_error(self) -> None:
"""Test recording schema validation error."""
with patch.object(
event_schema_validation_errors_total, "labels"
) as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_schema_validation_error(
topic="test.topic", validation_error="missing_required_field"
)
mock_labels.assert_called_once_with(
topic="test.topic", validation_error="missing_required_field"
)
mock_counter.inc.assert_called_once()
def test_record_nats_stream_message(self) -> None:
"""Test recording NATS stream message."""
with patch.object(nats_stream_messages_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_nats_stream_message(
stream_name="TAX_AGENT_EVENTS"
)
mock_labels.assert_called_once_with(stream_name="TAX_AGENT_EVENTS")
mock_counter.inc.assert_called_once()
def test_record_consumer_lag(self) -> None:
"""Test recording consumer lag."""
with patch.object(nats_consumer_lag_messages, "labels") as mock_labels:
mock_histogram = MagicMock()
mock_labels.return_value = mock_histogram
EventMetricsCollector.record_consumer_lag(
stream_name="TAX_AGENT_EVENTS",
consumer_group="tax-agent",
lag_messages=150,
)
mock_labels.assert_called_once_with(
stream_name="TAX_AGENT_EVENTS", consumer_group="tax-agent"
)
mock_histogram.observe.assert_called_once_with(150)
def test_record_publish_with_default_error_type(self) -> None:
"""Test recording publish failure with default error type."""
with patch.object(event_publish_errors_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_publish(
topic="test.topic",
duration_seconds=0.1,
success=False,
error_type=None, # No error type provided
)
mock_labels.assert_called_once_with(
topic="test.topic", error_type="unknown" # Should default to "unknown"
)
mock_counter.inc.assert_called_once()
def test_record_consume_with_default_error_type(self) -> None:
"""Test recording consume failure with default error type."""
with patch.object(event_processing_errors_total, "labels") as mock_labels:
mock_counter = MagicMock()
mock_labels.return_value = mock_counter
EventMetricsCollector.record_consume(
topic="test.topic",
consumer_group="test-group",
duration_seconds=1.0,
success=False,
error_type=None, # No error type provided
)
mock_labels.assert_called_once_with(
topic="test.topic",
consumer_group="test-group",
error_type="unknown", # Should default to "unknown"
)
mock_counter.inc.assert_called_once()
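The collector tests above only ever assert on .labels(...) followed by .inc() or .observe(...), which suggests a thin static façade over Prometheus metrics. The sketch below covers the publish path; the metric variable names come from the test imports, but the Prometheus metric name strings, help text, and use of prometheus_client are assumptions, and the real libs/events/metrics.py may be structured differently. The consume, DLQ, retry, and NATS helpers would follow the same label-then-inc/observe pattern.

# Hedged sketch of the publish-side metrics, assuming prometheus_client.
from prometheus_client import CollectorRegistry, Counter, Histogram

_registry = CollectorRegistry()

event_published_total = Counter(
    "event_published_total", "Events published", ["topic"], registry=_registry
)
event_publish_errors_total = Counter(
    "event_publish_errors_total",
    "Event publish failures",
    ["topic", "error_type"],
    registry=_registry,
)
event_publishing_duration_seconds = Histogram(
    "event_publishing_duration_seconds",
    "Event publish latency",
    ["topic"],
    registry=_registry,
)


def get_event_metrics_registry() -> CollectorRegistry:
    """Expose the registry so services can mount /metrics."""
    return _registry


class EventMetricsCollector:
    """Static façade so callers never touch metric objects directly."""

    @staticmethod
    def record_publish(
        topic: str,
        duration_seconds: float,
        success: bool,
        error_type: str | None = None,
    ) -> None:
        # Always record latency; count success and failure separately.
        event_publishing_duration_seconds.labels(topic=topic).observe(duration_seconds)
        if success:
            event_published_total.labels(topic=topic).inc()
        else:
            event_publish_errors_total.labels(
                topic=topic, error_type=error_type or "unknown"
            ).inc()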

View File

@@ -0,0 +1,500 @@
"""Tests for event schema validation."""
import pytest
from pydantic import ValidationError
from libs.events.topics import EventTopics
from libs.schemas.events import (
EVENT_SCHEMA_MAP,
CalculationReadyEventData,
DocumentExtractedEventData,
DocumentIngestedEventData,
DocumentOCRReadyEventData,
FirmSyncCompletedEventData,
FormFilledEventData,
HMRCSubmittedEventData,
KGUpsertedEventData,
KGUpsertReadyEventData,
RAGIndexedEventData,
ReviewCompletedEventData,
ReviewRequestedEventData,
get_schema_for_topic,
validate_event_data,
)
class TestDocumentIngestedEventData:
"""Test DocumentIngestedEventData schema."""
def test_valid_event(self) -> None:
"""Test creating a valid document ingested event."""
data = DocumentIngestedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
filename="invoice_2024.pdf",
mime_type="application/pdf",
size_bytes=102400,
checksum_sha256="a" * 64,
kind="invoice",
source="manual_upload",
storage_path="raw-documents/2024/invoice_2024.pdf",
)
assert data.doc_id == "01H8Y9Z5M3K7N2P4Q6R8T0V1W3"
assert data.size_bytes == 102400
assert len(data.checksum_sha256) == 64
def test_invalid_checksum(self) -> None:
"""Test invalid SHA-256 checksum."""
with pytest.raises(ValidationError) as exc_info:
DocumentIngestedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
filename="test.pdf",
mime_type="application/pdf",
size_bytes=1024,
checksum_sha256="invalid", # Too short
kind="invoice",
source="manual_upload",
storage_path="path/to/file",
)
assert "Invalid SHA-256 checksum format" in str(exc_info.value)
def test_negative_size(self) -> None:
"""Test negative file size validation."""
with pytest.raises(ValidationError):
DocumentIngestedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
filename="test.pdf",
mime_type="application/pdf",
size_bytes=-1, # Negative size
checksum_sha256="a" * 64,
kind="invoice",
source="manual_upload",
storage_path="path/to/file",
)
def test_immutable(self) -> None:
"""Test that event data is immutable."""
data = DocumentIngestedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
filename="test.pdf",
mime_type="application/pdf",
size_bytes=1024,
checksum_sha256="a" * 64,
kind="invoice",
source="manual_upload",
storage_path="path/to/file",
)
with pytest.raises(ValidationError):
data.filename = "changed.pdf" # Should raise because frozen=True
class TestDocumentOCRReadyEventData:
"""Test DocumentOCRReadyEventData schema."""
def test_valid_event(self) -> None:
"""Test creating a valid OCR ready event."""
data = DocumentOCRReadyEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
ocr_engine="tesseract",
page_count=3,
confidence_avg=0.95,
text_length=5000,
layout_detected=True,
languages_detected=["en"],
processing_time_ms=1500,
storage_path="ocr-results/doc_123.json",
)
assert data.ocr_engine == "tesseract"
assert data.confidence_avg == 0.95
assert 0.0 <= data.confidence_avg <= 1.0
def test_invalid_confidence(self) -> None:
"""Test invalid confidence score."""
with pytest.raises(ValidationError):
DocumentOCRReadyEventData(
doc_id="123",
ocr_engine="tesseract",
page_count=1,
confidence_avg=1.5, # > 1.0
text_length=100,
layout_detected=True,
processing_time_ms=1000,
storage_path="path",
)
def test_invalid_ocr_engine(self) -> None:
"""Test invalid OCR engine value."""
with pytest.raises(ValidationError):
DocumentOCRReadyEventData(
doc_id="123",
ocr_engine="invalid_engine", # Not in allowed values
page_count=1,
confidence_avg=0.9,
text_length=100,
layout_detected=True,
processing_time_ms=1000,
storage_path="path",
)
class TestDocumentExtractedEventData:
"""Test DocumentExtractedEventData schema."""
def test_valid_event(self) -> None:
"""Test creating a valid extraction event."""
data = DocumentExtractedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
extraction_id="extr_123",
strategy="hybrid",
fields_extracted=15,
confidence_avg=0.88,
calibrated_confidence=0.91,
model_name="gpt-4",
processing_time_ms=3000,
storage_path="extractions/extr_123.json",
)
assert data.strategy == "hybrid"
assert data.model_name == "gpt-4"
def test_valid_without_model(self) -> None:
"""Test extraction event without model (rules-based)."""
data = DocumentExtractedEventData(
doc_id="123",
extraction_id="extr_456",
strategy="rules",
fields_extracted=10,
confidence_avg=0.95,
calibrated_confidence=0.93,
model_name=None, # No model for rules-based
processing_time_ms=500,
storage_path="path",
)
assert data.model_name is None
assert data.strategy == "rules"
class TestKGEvents:
"""Test Knowledge Graph event schemas."""
def test_kg_upsert_ready(self) -> None:
"""Test KG upsert ready event."""
data = KGUpsertReadyEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
entity_count=25,
relationship_count=40,
tax_year="2024-25",
taxpayer_id="TP-001",
normalization_id="norm_123",
storage_path="normalized/norm_123.json",
)
assert data.entity_count == 25
assert data.tax_year == "2024-25"
def test_kg_upserted(self) -> None:
"""Test KG upserted event."""
data = KGUpsertedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
entities_created=10,
entities_updated=5,
relationships_created=20,
relationships_updated=10,
shacl_violations=0,
processing_time_ms=2000,
success=True,
error_message=None,
)
assert data.success is True
assert data.shacl_violations == 0
def test_kg_upserted_with_violations(self) -> None:
"""Test KG upserted event with SHACL violations."""
data = KGUpsertedEventData(
doc_id="123",
entities_created=5,
entities_updated=0,
relationships_created=8,
relationships_updated=0,
shacl_violations=3,
processing_time_ms=1500,
success=False,
error_message="SHACL validation failed: Missing required property",
)
assert data.success is False
assert data.shacl_violations == 3
assert data.error_message is not None
class TestRAGIndexedEventData:
"""Test RAG indexed event schema."""
def test_valid_event(self) -> None:
"""Test creating a valid RAG indexed event."""
data = RAGIndexedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
collection_name="firm_knowledge",
chunks_indexed=45,
embedding_model="bge-small-en-v1.5",
pii_detected=True,
pii_redacted=True,
processing_time_ms=5000,
storage_path="chunks/doc_123.json",
)
assert data.pii_detected is True
assert data.pii_redacted is True
assert data.chunks_indexed == 45
class TestCalculationReadyEventData:
"""Test calculation ready event schema."""
def test_valid_event(self) -> None:
"""Test creating a valid calculation event."""
data = CalculationReadyEventData(
taxpayer_id="TP-001",
tax_year="2024-25",
schedule_id="SA103",
calculation_id="calc_789",
boxes_computed=50,
total_income=85000.50,
total_tax=18500.25,
confidence=0.92,
evidence_count=15,
processing_time_ms=2500,
storage_path="calculations/calc_789.json",
)
assert data.schedule_id == "SA103"
assert data.total_income == 85000.50
assert data.total_tax == 18500.25
def test_valid_without_totals(self) -> None:
"""Test calculation event without totals (partial calculation)."""
data = CalculationReadyEventData(
taxpayer_id="TP-001",
tax_year="2024-25",
schedule_id="SA102",
calculation_id="calc_456",
boxes_computed=20,
total_income=None,
total_tax=None,
confidence=0.85,
evidence_count=10,
processing_time_ms=1000,
storage_path="calculations/calc_456.json",
)
assert data.total_income is None
assert data.total_tax is None
class TestFormFilledEventData:
"""Test form filled event schema."""
def test_valid_event(self) -> None:
"""Test creating a valid form filled event."""
data = FormFilledEventData(
taxpayer_id="TP-001",
tax_year="2024-25",
form_id="SA100",
fields_filled=75,
pdf_size_bytes=524288,
storage_path="forms/SA100_filled.pdf",
evidence_bundle_path="evidence/bundle_123.zip",
checksum_sha256="b" * 64,
)
assert data.form_id == "SA100"
assert data.evidence_bundle_path is not None
class TestHMRCSubmittedEventData:
"""Test HMRC submitted event schema."""
def test_successful_submission(self) -> None:
"""Test successful HMRC submission."""
data = HMRCSubmittedEventData(
taxpayer_id="TP-001",
tax_year="2024-25",
submission_id="sub_999",
hmrc_reference="HMRC-REF-12345",
submission_type="sandbox",
success=True,
status_code=200,
error_message=None,
processing_time_ms=3000,
)
assert data.success is True
assert data.hmrc_reference is not None
def test_failed_submission(self) -> None:
"""Test failed HMRC submission."""
data = HMRCSubmittedEventData(
taxpayer_id="TP-001",
tax_year="2024-25",
submission_id="sub_888",
hmrc_reference=None,
submission_type="live",
success=False,
status_code=400,
error_message="Invalid UTR number",
processing_time_ms=1500,
)
assert data.success is False
assert data.error_message is not None
def test_invalid_submission_type(self) -> None:
"""Test invalid submission type."""
with pytest.raises(ValidationError):
HMRCSubmittedEventData(
taxpayer_id="TP-001",
tax_year="2024-25",
submission_id="sub_777",
hmrc_reference=None,
submission_type="invalid", # Not in allowed values
success=False,
status_code=None,
error_message=None,
processing_time_ms=1000,
)
class TestReviewEvents:
"""Test review event schemas."""
def test_review_requested(self) -> None:
"""Test review requested event."""
data = ReviewRequestedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
review_type="extraction",
priority="high",
reason="Low confidence extraction (0.65)",
assigned_to="reviewer@example.com",
due_date="2024-12-01T10:00:00Z",
metadata={"extraction_id": "extr_123"},
)
assert data.priority == "high"
assert data.review_type == "extraction"
def test_review_completed(self) -> None:
"""Test review completed event."""
data = ReviewCompletedEventData(
doc_id="01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
review_id="rev_456",
reviewer="reviewer@example.com",
decision="approved",
changes_made=3,
comments="Fixed vendor name and amount",
review_duration_seconds=180,
)
assert data.decision == "approved"
assert data.changes_made == 3
class TestFirmSyncCompletedEventData:
"""Test firm sync completed event schema."""
def test_successful_sync(self) -> None:
"""Test successful firm sync."""
data = FirmSyncCompletedEventData(
firm_id="FIRM-001",
connector_type="xero",
sync_id="sync_123",
records_synced=150,
records_created=50,
records_updated=100,
records_failed=0,
success=True,
error_message=None,
processing_time_ms=10000,
)
assert data.success is True
assert data.records_failed == 0
def test_partial_sync_failure(self) -> None:
"""Test sync with some failures."""
data = FirmSyncCompletedEventData(
firm_id="FIRM-002",
connector_type="sage",
sync_id="sync_456",
records_synced=90,
records_created=30,
records_updated=60,
records_failed=10,
success=True, # Overall success despite some failures
error_message="10 records failed validation",
processing_time_ms=15000,
)
assert data.records_failed == 10
assert data.error_message is not None
class TestSchemaMapping:
"""Test schema mapping and validation utilities."""
def test_all_topics_have_schemas(self) -> None:
"""Test that all topics in EventTopics have corresponding schemas."""
topic_values = {
getattr(EventTopics, attr)
for attr in dir(EventTopics)
if not attr.startswith("_")
}
schema_topics = set(EVENT_SCHEMA_MAP.keys())
# All event topics should have schemas
missing_schemas = topic_values - schema_topics
assert not missing_schemas, f"Missing schemas for topics: {missing_schemas}"
def test_validate_event_data(self) -> None:
"""Test validate_event_data function."""
valid_data = {
"doc_id": "01H8Y9Z5M3K7N2P4Q6R8T0V1W3",
"filename": "test.pdf",
"mime_type": "application/pdf",
"size_bytes": 1024,
"checksum_sha256": "a" * 64,
"kind": "invoice",
"source": "manual_upload",
"storage_path": "path/to/file",
}
result = validate_event_data("doc.ingested", valid_data)
assert isinstance(result, DocumentIngestedEventData)
assert result.doc_id == "01H8Y9Z5M3K7N2P4Q6R8T0V1W3"
def test_validate_unknown_topic(self) -> None:
"""Test validation with unknown topic."""
with pytest.raises(ValueError, match="Unknown event topic"):
validate_event_data("unknown.topic", {})
def test_validate_invalid_data(self) -> None:
"""Test validation with invalid data."""
invalid_data = {
"doc_id": "123",
"filename": "test.pdf",
# Missing required fields
}
with pytest.raises(ValidationError):
validate_event_data("doc.ingested", invalid_data)
def test_get_schema_for_topic(self) -> None:
"""Test get_schema_for_topic function."""
schema = get_schema_for_topic("doc.ingested")
assert schema == DocumentIngestedEventData
def test_get_schema_unknown_topic(self) -> None:
"""Test get_schema_for_topic with unknown topic."""
with pytest.raises(ValueError, match="Unknown event topic"):
get_schema_for_topic("unknown.topic")
def test_schema_prevents_extra_fields(self) -> None:
"""Test that schemas prevent extra fields (extra='forbid')."""
with pytest.raises(ValidationError) as exc_info:
DocumentIngestedEventData(
doc_id="123",
filename="test.pdf",
mime_type="application/pdf",
size_bytes=1024,
checksum_sha256="a" * 64,
kind="invoice",
source="manual_upload",
storage_path="path",
unexpected_field="should_fail", # Extra field
)
assert "Extra inputs are not permitted" in str(exc_info.value)

View File

@@ -1,10 +1,10 @@
"""Tests for NATS event bus implementation."""
import asyncio
import json
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from nats.js.api import ConsumerConfig
from libs.events.base import EventPayload
from libs.events.nats_bus import NATSEventBus
@@ -41,9 +41,12 @@ class TestNATSEventBus:
assert nats_bus.servers == ["nats://localhost:4222"]
assert nats_bus.stream_name == "TEST_STREAM"
assert nats_bus.consumer_group == "test-group"
assert nats_bus.dlq_stream_name == "TAX_AGENT_DLQ"
assert nats_bus.max_retries == 3
assert not nats_bus.running
assert nats_bus.nc is None
assert nats_bus.js is None
assert nats_bus.dlq is None
@pytest.mark.asyncio
async def test_initialization_with_multiple_servers(self):
@@ -54,14 +57,21 @@ class TestNATSEventBus:
@pytest.mark.asyncio
@patch("libs.events.nats_bus.nats.connect")
async def test_start(self, mock_connect, nats_bus):
@patch("libs.events.nats_bus.DLQHandler")
async def test_start(self, mock_dlq_cls, mock_connect, nats_bus):
"""Test starting the NATS event bus."""
# Mock NATS connection and JetStream
mock_nc = AsyncMock()
mock_js = AsyncMock()
mock_nc.jetstream.return_value = mock_js
# jetstream() is synchronous, so mock it with a MagicMock rather than an AsyncMock
mock_nc.jetstream = MagicMock(return_value=mock_js)
mock_connect.return_value = mock_nc
# Mock DLQ handler
mock_dlq_instance = MagicMock()
mock_dlq_instance.ensure_dlq_stream_exists = AsyncMock()
mock_dlq_cls.return_value = mock_dlq_instance
# Mock stream info to simulate existing stream
mock_js.stream_info.return_value = {"name": "TEST_STREAM"}
@@ -70,26 +80,40 @@ class TestNATSEventBus:
assert nats_bus.running
assert nats_bus.nc == mock_nc
assert nats_bus.js == mock_js
assert nats_bus.dlq == mock_dlq_instance
mock_connect.assert_called_once_with(servers=["nats://localhost:4222"])
mock_dlq_instance.ensure_dlq_stream_exists.assert_called_once()
@pytest.mark.asyncio
@patch("libs.events.nats_bus.nats.connect")
async def test_start_creates_stream_if_not_exists(self, mock_connect, nats_bus):
@patch("libs.events.nats_bus.DLQHandler")
async def test_start_creates_stream_if_not_exists(
self, mock_dlq_cls, mock_connect, nats_bus
):
"""Test that start creates stream if it doesn't exist."""
# Mock NATS connection and JetStream
mock_nc = AsyncMock()
mock_js = AsyncMock()
mock_nc.jetstream.return_value = mock_js
mock_nc.jetstream = MagicMock(return_value=mock_js)
mock_connect.return_value = mock_nc
# Mock DLQ handler
mock_dlq_instance = MagicMock()
mock_dlq_instance.ensure_dlq_stream_exists = AsyncMock()
mock_dlq_cls.return_value = mock_dlq_instance
# Mock stream_info to raise NotFoundError, then add_stream
from nats.js.errors import NotFoundError
mock_js.stream_info.side_effect = NotFoundError
mock_js.add_stream = AsyncMock()
await nats_bus.start()
mock_js.add_stream.assert_called_once()
call_args = mock_js.add_stream.call_args
assert call_args[1]["subjects"] == ["TEST_STREAM.>"]
@pytest.mark.asyncio
async def test_start_already_running(self, nats_bus):
@@ -107,17 +131,22 @@ class TestNATSEventBus:
# Setup mock objects
mock_nc = AsyncMock()
mock_subscription = AsyncMock()
mock_task = AsyncMock()
# Create a real task for consumer_tasks
async def dummy_task():
pass
real_task = asyncio.create_task(dummy_task())
nats_bus.running = True
nats_bus.nc = mock_nc
nats_bus.subscriptions = {"test-topic": mock_subscription}
nats_bus.consumer_tasks = [mock_task]
nats_bus.consumer_tasks = [real_task]
await nats_bus.stop()
assert not nats_bus.running
mock_task.cancel.assert_called_once()
assert real_task.cancelled() or real_task.done()
mock_subscription.unsubscribe.assert_called_once()
mock_nc.close.assert_called_once()
@@ -129,7 +158,8 @@ class TestNATSEventBus:
assert not nats_bus.running
@pytest.mark.asyncio
async def test_publish(self, nats_bus, event_payload):
@patch("libs.events.nats_bus.EventMetricsCollector")
async def test_publish(self, mock_metrics, nats_bus, event_payload):
"""Test publishing an event."""
# Setup mock JetStream
mock_js = AsyncMock()
@@ -146,6 +176,10 @@ class TestNATSEventBus:
assert call_args[1]["subject"] == "TEST_STREAM.test-topic"
assert call_args[1]["payload"] == event_payload.to_json().encode()
# Verify metrics recorded
mock_metrics.record_publish.assert_called_once()
assert mock_metrics.record_publish.call_args[1]["success"] is True
@pytest.mark.asyncio
async def test_publish_not_started(self, nats_bus, event_payload):
"""Test publishing when event bus is not started."""
@@ -153,7 +187,8 @@ class TestNATSEventBus:
await nats_bus.publish("test-topic", event_payload)
@pytest.mark.asyncio
async def test_publish_failure(self, nats_bus, event_payload):
@patch("libs.events.nats_bus.EventMetricsCollector")
async def test_publish_failure(self, mock_metrics, nats_bus, event_payload):
"""Test publishing failure."""
# Setup mock JetStream that raises exception
mock_js = AsyncMock()
@@ -164,6 +199,10 @@ class TestNATSEventBus:
assert result is False
# Verify metrics recorded failure
mock_metrics.record_publish.assert_called_once()
assert mock_metrics.record_publish.call_args[1]["success"] is False
@pytest.mark.asyncio
async def test_subscribe(self, nats_bus):
"""Test subscribing to a topic."""
@@ -184,11 +223,19 @@ class TestNATSEventBus:
assert test_handler in nats_bus.handlers["test-topic"]
assert "test-topic" in nats_bus.subscriptions
mock_js.pull_subscribe.assert_called_once()
# Verify ConsumerConfig
call_kwargs = mock_js.pull_subscribe.call_args[1]
config = call_kwargs["config"]
assert isinstance(config, ConsumerConfig)
assert config.max_deliver == 5 # 3 retries + 2 buffer
mock_create_task.assert_called_once()
@pytest.mark.asyncio
async def test_subscribe_not_started(self, nats_bus):
"""Test subscribing when event bus is not started."""
async def test_handler(topic: str, payload: EventPayload) -> None:
pass
@@ -220,7 +267,8 @@ class TestNATSEventBus:
assert handler2 in nats_bus.handlers["test-topic"]
@pytest.mark.asyncio
async def test_consume_messages(self, nats_bus, event_payload):
@patch("libs.events.nats_bus.EventMetricsCollector")
async def test_consume_messages(self, mock_metrics, nats_bus, event_payload):
"""Test consuming messages from NATS."""
# Setup mock subscription and message
mock_subscription = AsyncMock()
@@ -253,6 +301,10 @@ class TestNATSEventBus:
assert received_payload.event_id == event_payload.event_id
mock_message.ack.assert_called_once()
# Verify metrics
mock_metrics.record_consume.assert_called_once()
assert mock_metrics.record_consume.call_args[1]["success"] is True
@pytest.mark.asyncio
async def test_factory_integration(self):
"""Test that the factory can create a NATS event bus."""