Initial commit
Some checks failed: every CI/CD Pipeline job for this push was cancelled, including Code Quality & Linting, Policy Validation, Test Suite, Build Docker Images (svc-coverage, svc-extract, svc-firm-connectors, svc-forms, svc-hmrc, svc-ingestion, svc-kg, svc-normalize-map, svc-ocr, svc-rag-indexer, svc-rag-retriever, svc-reason, svc-rpa, ui-review), Security Scanning (svc-coverage, svc-extract, svc-kg, svc-rag-retriever, ui-review), Generate SBOM, Deploy to Staging, Deploy to Production, and Notifications.

Commit b324ff09ef by harkon, 2025-10-11 08:41:36 +01:00. 276 changed files with 55,220 additions and 0 deletions.

apps/__init__.py (4 lines added)

@@ -0,0 +1,4 @@
# file: /Users/harris/Projects/ai-tax-agent/apps/__init__.py
# hypothesis_version: 6.138.15
[]


@@ -0,0 +1,53 @@
# Multi-stage build for svc-coverage
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_coverage/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_coverage/ ./apps/svc_coverage/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_coverage.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -0,0 +1 @@
"""Coverage service package."""


@@ -0,0 +1,112 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python-dateutil library that can be
# installed by adding `alembic[tz]` to the pip requirements
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version number format
version_num_format = %04d
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses
# os.pathsep. If this key is omitted entirely, it falls back to the legacy
# behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
version_path_separator = os
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = postgresql://user:pass@localhost:5432/coverage
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S


@@ -0,0 +1,92 @@
"""Alembic environment configuration for coverage service."""
import os
import sys
from logging.config import fileConfig
from alembic import context
from sqlalchemy import engine_from_config, pool
# Add the parent directory to the path so we can import our models
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Import your models here
from apps.svc_coverage.models import Base
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def get_url():
"""Get database URL from environment or config."""
return os.getenv("DATABASE_URL", config.get_main_option("sqlalchemy.url"))
def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = get_url()
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online() -> None:
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
configuration = config.get_section(config.config_ini_section)
configuration["sqlalchemy.url"] = get_url()
connectable = engine_from_config(
configuration,
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
context.configure(
connection=connection, target_metadata=target_metadata
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
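
The env.py above resolves the connection string from the DATABASE_URL environment variable and only falls back to the sqlalchemy.url in alembic.ini. A minimal sketch of driving the same migrations from Python rather than the alembic CLI, assuming alembic.ini and the alembic/ script directory sit in the service's working directory (the helper name and paths here are illustrative):

import os
from alembic import command
from alembic.config import Config

def upgrade_coverage_db(ini_path: str = "alembic.ini") -> None:
    # Point env.py's get_url() at the target database; alembic.ini's value is used otherwise
    os.environ.setdefault("DATABASE_URL", "postgresql://user:pass@localhost:5432/coverage")
    command.upgrade(Config(ini_path), "head")

if __name__ == "__main__":
    upgrade_coverage_db()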


@@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}


@@ -0,0 +1,76 @@
"""Initial coverage tables
Revision ID: 0001
Revises:
Create Date: 2024-09-14 12:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '0001'
down_revision = None
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create coverage_versions table
op.create_table(
'coverage_versions',
sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
sa.Column('version', sa.String(length=50), nullable=False),
sa.Column('jurisdiction', sa.String(length=10), nullable=False),
sa.Column('tax_year', sa.String(length=10), nullable=False),
sa.Column('tenant_id', sa.String(length=100), nullable=True),
sa.Column('source_files', postgresql.JSON(astext_type=sa.Text()), nullable=False),
sa.Column('compiled_at', sa.DateTime(), nullable=False),
sa.Column('hash', sa.String(length=64), nullable=False),
sa.PrimaryKeyConstraint('id')
)
# Create indexes for coverage_versions
op.create_index('ix_coverage_versions_version', 'coverage_versions', ['version'])
op.create_index('ix_coverage_versions_jurisdiction_tax_year', 'coverage_versions', ['jurisdiction', 'tax_year'])
op.create_index('ix_coverage_versions_tenant_id', 'coverage_versions', ['tenant_id'])
op.create_index('ix_coverage_versions_hash', 'coverage_versions', ['hash'])
# Create coverage_audit table
op.create_table(
'coverage_audit',
sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
sa.Column('taxpayer_id', sa.String(length=100), nullable=False),
sa.Column('tax_year', sa.String(length=10), nullable=False),
sa.Column('policy_version', sa.String(length=50), nullable=False),
sa.Column('overall_status', sa.String(length=20), nullable=False),
sa.Column('blocking_items', postgresql.JSON(astext_type=sa.Text()), nullable=False),
sa.Column('created_at', sa.DateTime(), nullable=False),
sa.Column('trace_id', sa.String(length=100), nullable=True),
sa.PrimaryKeyConstraint('id')
)
# Create indexes for coverage_audit
op.create_index('ix_coverage_audit_taxpayer_id', 'coverage_audit', ['taxpayer_id'])
op.create_index('ix_coverage_audit_tax_year', 'coverage_audit', ['tax_year'])
op.create_index('ix_coverage_audit_taxpayer_tax_year', 'coverage_audit', ['taxpayer_id', 'tax_year'])
op.create_index('ix_coverage_audit_created_at', 'coverage_audit', ['created_at'])
op.create_index('ix_coverage_audit_trace_id', 'coverage_audit', ['trace_id'])
def downgrade() -> None:
# Drop coverage_audit table and indexes
op.drop_index('ix_coverage_audit_trace_id', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_created_at', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_taxpayer_tax_year', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_tax_year', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_taxpayer_id', table_name='coverage_audit')
op.drop_table('coverage_audit')
# Drop coverage_versions table and indexes
op.drop_index('ix_coverage_versions_hash', table_name='coverage_versions')
op.drop_index('ix_coverage_versions_tenant_id', table_name='coverage_versions')
op.drop_index('ix_coverage_versions_jurisdiction_tax_year', table_name='coverage_versions')
op.drop_index('ix_coverage_versions_version', table_name='coverage_versions')
op.drop_table('coverage_versions')

apps/svc_coverage/main.py (523 lines added)

@@ -0,0 +1,523 @@
# FILE: apps/svc_coverage/main.py
# Coverage policy service with evaluation, clarification, and hot reload
import os
import sys
from typing import Any
import structlog
from fastapi import Depends, HTTPException
from pydantic import BaseModel
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.coverage import CoverageEvaluator
from libs.events import EventBus
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.policy import PolicyLoader, get_policy_loader
from libs.schemas import (
ClarifyContext,
ClarifyResponse,
CoverageGap,
CoverageReport,
PolicyError,
UploadOption,
ValidationResult,
)
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
async def http_exception_handler(_request, exc) -> dict[str, str | int]:
"""Handle HTTP exceptions"""
return {"detail": exc.detail, "status_code": exc.status_code}
class CoverageSettings(BaseAppSettings):
"""Settings for Coverage service"""
service_name: str = "svc-coverage"
# Policy configuration
config_dir: str = "config"
policy_reload_enabled: bool = True
# Database
postgres_url: str = "postgresql://user:pass@localhost:5432/coverage"
# External services
rag_service_url: str = "http://svc-rag-retriever:8000"
# Create app and settings
app, settings = create_app(
service_name="svc-coverage",
title="Tax Agent Coverage Policy Service",
description="Coverage policy evaluation and clarification service",
settings_class=CoverageSettings,
)
# Global state
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
policy_loader: PolicyLoader | None = None
current_policy: Any = None
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, event_bus, policy_loader, current_policy
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
# Initialize policy loader
policy_loader = get_policy_loader(settings.config_dir)
# Load initial policy
try:
policy = policy_loader.load_policy()
current_policy = policy_loader.compile_predicates(policy)
logger.info("Initial policy loaded", version=policy.version)
except Exception as e:
logger.error("Failed to load initial policy", error=str(e))
current_policy = None
logger.info("Coverage service started")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.close()
logger.info("Coverage service stopped")
# Request/Response models
class CheckCoverageRequest(BaseModel):
"""Request to check document coverage"""
tax_year: str
taxpayer_id: str
class ClarifyRequest(BaseModel):
"""Request to generate clarifying question"""
gap: CoverageGap
context: ClarifyContext
class ReloadRequest(BaseModel):
"""Request to reload policy"""
force: bool = False
# Metrics
metrics = get_metrics()
tracer = get_tracer()
@app.post("/v1/coverage/check")
async def check_coverage(
request: CheckCoverageRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> CoverageReport:
"""Check document coverage for taxpayer"""
with tracer.start_as_current_span("check_coverage") as span:
span.set_attribute("taxpayer_id", request.taxpayer_id)
span.set_attribute("tax_year", request.tax_year)
span.set_attribute("tenant_id", tenant_id)
try:
if not current_policy:
raise HTTPException(status_code=503, detail="Policy not loaded")
# Create evaluator with KG and RAG clients
evaluator = CoverageEvaluator(
kg_client=neo4j_client,
rag_client=None, # TODO: Initialize RAG client
)
# Perform coverage evaluation
report = await evaluator.check_document_coverage(
request.taxpayer_id,
request.tax_year,
current_policy,
)
# Record audit trail
await _record_coverage_audit(report, tenant_id)
# Update metrics
metrics.counter("coverage_checks_total").labels(
tenant_id=tenant_id,
tax_year=request.tax_year,
overall_status=report.overall_status.value,
).inc()
return report
except HTTPException:
# Re-raise HTTP exceptions as-is
raise
except Exception as e:
logger.error(
"Coverage check failed",
taxpayer_id=request.taxpayer_id,
tax_year=request.tax_year,
error=str(e),
)
raise HTTPException(
status_code=500, detail=f"Coverage check failed: {str(e)}"
) from e
@app.post("/v1/coverage/clarify")
async def clarify_gap(
request: ClarifyRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> ClarifyResponse:
"""Generate clarifying question for coverage gap"""
with tracer.start_as_current_span("clarify_gap") as span:
span.set_attribute("schedule_id", request.gap.schedule_id)
span.set_attribute("evidence_id", request.gap.evidence_id)
span.set_attribute("tenant_id", tenant_id)
try:
if not current_policy:
raise HTTPException(status_code=503, detail="Policy not loaded")
# Generate clarifying question
response = await _generate_clarifying_question(request.gap, request.context)
# Update metrics
metrics.counter("clarifications_total").labels(
tenant_id=tenant_id,
schedule_id=request.gap.schedule_id,
evidence_id=request.gap.evidence_id,
).inc()
return response
except HTTPException:
# Re-raise HTTP exceptions as-is
raise
except Exception as e:
logger.error(
"Clarification failed",
gap=request.gap.dict(),
error=str(e),
)
raise HTTPException(
status_code=500, detail=f"Clarification failed: {str(e)}"
) from e
@app.post("/admin/coverage/reload")
async def reload_policy(
request: ReloadRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Reload coverage policy from files"""
# Check admin permissions
user_groups = current_user.get("groups", [])
if "admin" not in user_groups:
raise HTTPException(status_code=403, detail="Admin access required")
with tracer.start_as_current_span("reload_policy") as span:
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("force", request.force)
try:
global current_policy
if not policy_loader:
raise HTTPException(
status_code=503, detail="Policy loader not initialized"
)
# Load and compile new policy
policy = policy_loader.load_policy()
new_compiled_policy = policy_loader.compile_predicates(policy)
# Record new policy version
await _record_policy_version(new_compiled_policy, tenant_id)
# Update current policy
current_policy = new_compiled_policy
logger.info(
"Policy reloaded",
version=policy.version,
hash=new_compiled_policy.hash,
tenant_id=tenant_id,
)
return {
"success": True,
"version": policy.version,
"hash": new_compiled_policy.hash,
"compiled_at": new_compiled_policy.compiled_at.isoformat(),
"source_files": new_compiled_policy.source_files,
}
except PolicyError as e:
logger.error("Policy reload failed", error=str(e))
raise HTTPException(
status_code=400, detail=f"Policy error: {str(e)}"
) from e
except Exception as e:
logger.error("Policy reload failed", error=str(e))
raise HTTPException(
status_code=500, detail=f"Reload failed: {str(e)}"
) from e
@app.get("/v1/coverage/policy")
async def get_current_policy(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get current compiled policy (no secrets, no PII)"""
with tracer.start_as_current_span("get_policy") as span:
span.set_attribute("tenant_id", tenant_id)
if not current_policy:
raise HTTPException(status_code=503, detail="Policy not loaded")
# Return sanitized policy info
return {
"version": current_policy.policy.version,
"jurisdiction": current_policy.policy.jurisdiction,
"tax_year": current_policy.policy.tax_year,
"compiled_at": current_policy.compiled_at.isoformat(),
"hash": current_policy.hash,
"source_files": current_policy.source_files,
"schedules": list(current_policy.policy.schedules.keys()),
"document_kinds": current_policy.policy.document_kinds,
}
@app.get("/v1/coverage/validate")
async def validate_policy(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> ValidationResult:
"""Validate current policy configuration"""
with tracer.start_as_current_span("validate_policy") as span:
span.set_attribute("tenant_id", tenant_id)
try:
if not policy_loader:
raise HTTPException(
status_code=503, detail="Policy loader not initialized"
)
# Load policy as dict for validation
policy_dict = policy_loader._load_yaml_file(
os.path.join(settings.config_dir, "coverage.yaml")
)
# Validate policy
result = policy_loader.validate_policy(policy_dict)
# Additional validation: check box existence in KG
if neo4j_client and result.ok:
box_validation_errors = await _validate_boxes_in_kg(policy_dict)
if box_validation_errors:
result.errors.extend(box_validation_errors)
result.ok = False
return result
except Exception as e:
logger.error("Policy validation failed", error=str(e))
return ValidationResult(
ok=False,
errors=[f"Validation failed: {str(e)}"],
)
# Helper functions
async def _record_coverage_audit(report: CoverageReport, tenant_id: str) -> None:
"""Record coverage audit trail"""
# TODO: Implement database recording
logger.info(
"Coverage audit recorded",
taxpayer_id=report.taxpayer_id,
tax_year=report.tax_year,
overall_status=report.overall_status.value,
blocking_items=len(report.blocking_items),
tenant_id=tenant_id,
)
async def _record_policy_version(compiled_policy: Any, tenant_id: str) -> None:
"""Record new policy version"""
# TODO: Implement database recording
logger.info(
"Policy version recorded",
version=compiled_policy.policy.version,
hash=compiled_policy.hash,
tenant_id=tenant_id,
)
async def _generate_clarifying_question(
gap: CoverageGap, context: ClarifyContext
) -> ClarifyResponse:
"""Generate clarifying question for coverage gap"""
if not current_policy:
raise ValueError("Policy not loaded")
# Get question template
templates = current_policy.policy.question_templates
default_template = templates.default
# Build question text
evidence_name = gap.evidence_id
schedule_name = gap.schedule_id
boxes_text = ", ".join(gap.boxes) if gap.boxes else "relevant boxes"
alternatives_text = (
", ".join(gap.acceptable_alternatives)
if gap.acceptable_alternatives
else "alternative documents"
)
question_text = default_template["text"].format(
schedule=schedule_name,
tax_year=context.tax_year,
evidence=evidence_name,
boxes=boxes_text,
alternatives=alternatives_text,
)
why_text = default_template["why"].format(
why=gap.reason,
guidance_doc="policy guidance",
)
# Build upload options
options = []
if gap.acceptable_alternatives:
for alt in gap.acceptable_alternatives:
options.append(
UploadOption(
label=f"Upload {alt} (PDF/CSV)",
accepted_formats=["pdf", "csv"],
upload_endpoint=f"/v1/ingest/upload?tag={alt}",
)
)
else:
options.append(
UploadOption(
label=f"Upload {evidence_name} (PDF/CSV)",
accepted_formats=["pdf", "csv"],
upload_endpoint=f"/v1/ingest/upload?tag={evidence_name}",
)
)
return ClarifyResponse(
question_text=question_text,
why_it_is_needed=why_text,
citations=gap.citations,
options_to_provide=options,
blocking=(gap.role.value == "REQUIRED"),
boxes_affected=gap.boxes,
)
async def _validate_boxes_in_kg(policy_dict: dict[str, Any]) -> list[str]:
"""Validate that all referenced boxes exist in KG"""
if not neo4j_client:
return ["KG client not available for box validation"]
errors = []
all_boxes = set()
# Collect all box references
for schedule in policy_dict.get("schedules", {}).values():
for evidence in schedule.get("evidence", []):
all_boxes.update(evidence.get("boxes", []))
if all_boxes:
try:
from libs.neo import kg_boxes_exist
box_existence = await kg_boxes_exist(neo4j_client, list(all_boxes))
for box_id, exists in box_existence.items():
if not exists:
errors.append(f"Form box '{box_id}' not found in knowledge graph")
except Exception as e:
errors.append(f"Failed to validate boxes in KG: {str(e)}")
return errors
# Health check endpoints
@app.get("/healthz")
async def health_check() -> dict[str, str]:
"""Health check endpoint"""
return {"status": "healthy", "service": "svc-coverage"}
@app.get("/readyz")
async def readiness_check() -> dict[str, str]:
"""Readiness check endpoint"""
return {"status": "ready", "service": "svc-coverage"}
@app.get("/livez")
async def liveness_check() -> dict[str, str]:
"""Liveness check endpoint"""
return {"status": "alive", "service": "svc-coverage"}
# Metrics endpoint (internal only)
@app.get("/metrics")
async def get_metrics_endpoint() -> str:
"""Prometheus metrics endpoint"""
# This would return Prometheus format metrics
return "# Coverage service metrics\n"
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
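
For reference, a rough client-side sketch of calling the coverage endpoints above with httpx. The base URL and the bearer-token header are assumptions; the real authentication and tenant headers are whatever libs.security's get_current_user and get_tenant_id dependencies expect.

import httpx

BASE_URL = "http://localhost:8000"             # assumed local address of svc-coverage
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder credentials

def check_coverage(taxpayer_id: str, tax_year: str) -> dict:
    """POST /v1/coverage/check and return the CoverageReport payload."""
    resp = httpx.post(
        f"{BASE_URL}/v1/coverage/check",
        json={"taxpayer_id": taxpayer_id, "tax_year": tax_year},
        headers=HEADERS,
        timeout=30.0,
    )
    resp.raise_for_status()
    return resp.json()

def policy_summary() -> dict:
    """GET /v1/coverage/policy (sanitised policy info: version, hash, schedules)."""
    resp = httpx.get(f"{BASE_URL}/v1/coverage/policy", headers=HEADERS, timeout=30.0)
    resp.raise_for_status()
    return resp.json()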


@@ -0,0 +1,46 @@
"""Database models for coverage service."""
# FILE: apps/svc_coverage/models.py
from datetime import datetime
from sqlalchemy import JSON, Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class CoverageVersion(Base):
"""Policy version tracking table"""
__tablename__ = "coverage_versions"
id = Column(Integer, primary_key=True, autoincrement=True)
version = Column(String(50), nullable=False)
jurisdiction = Column(String(10), nullable=False)
tax_year = Column(String(10), nullable=False)
tenant_id = Column(String(100), nullable=True)
source_files = Column(JSON, nullable=False, default=list)
compiled_at = Column(DateTime, nullable=False, default=datetime.utcnow)
hash = Column(String(64), nullable=False)
def __repr__(self) -> str:
return f"<CoverageVersion(id={self.id}, version='{self.version}', hash='{self.hash[:8]}...')>"
class CoverageAudit(Base):
"""Coverage evaluation audit trail"""
__tablename__ = "coverage_audit"
id = Column(Integer, primary_key=True, autoincrement=True)
taxpayer_id = Column(String(100), nullable=False)
tax_year = Column(String(10), nullable=False)
policy_version = Column(String(50), nullable=False)
overall_status = Column(String(20), nullable=False)
blocking_items = Column(JSON, nullable=False, default=list)
created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
trace_id = Column(String(100), nullable=True)
def __repr__(self) -> str:
return f"<CoverageAudit(id={self.id}, taxpayer_id='{self.taxpayer_id}', status='{self.overall_status}')>"


@@ -0,0 +1,53 @@
# Multi-stage build for svc-extract
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_extract/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_extract/ ./apps/svc_extract/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_extract.main:app", "--host", "0.0.0.0", "--port", "8000"]

apps/svc_extract/main.py (625 lines added)

@@ -0,0 +1,625 @@
"""LLM-based field extraction with confidence scoring and provenance tracking."""
# FILE: apps/svc_extract/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.calibration import ConfidenceCalibrator
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, ExtractionRequest, ExtractionResponse
from libs.security import (
create_trusted_proxy_middleware,
get_current_user,
get_tenant_id,
)
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class ExtractionSettings(BaseAppSettings):
"""Settings for extraction service"""
service_name: str = "svc-extract"
# LLM configuration
openai_api_key: str = ""
model_name: str = "gpt-4"
max_tokens: int = 2000
temperature: float = 0.1
# Extraction configuration
confidence_threshold: float = 0.7
max_retries: int = 3
chunk_size: int = 4000
# Prompt templates
extraction_prompt_template: str = """
Extract the following fields from this document text:
{field_definitions}
Document text:
{document_text}
Return a JSON object with the extracted fields and confidence scores.
"""
# Create app and settings
app, settings = create_app(
service_name="svc-extract",
title="Tax Agent Extraction Service",
description="LLM-based field extraction service",
settings_class=ExtractionSettings,
)
# Add middleware
middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
app.add_middleware(middleware_factory)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
confidence_calibrator: ConfidenceCalibrator | None = None
tracer = get_tracer("svc-extract")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus, confidence_calibrator
logger.info("Starting extraction service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise Exception("Event bus not initialized")
await event_bus.start()
# Subscribe to OCR completion events
await event_bus.subscribe(EventTopics.DOC_OCR_READY, _handle_ocr_ready)
# Initialize confidence calibrator
confidence_calibrator = ConfidenceCalibrator(method="temperature")
logger.info("Extraction service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down extraction service")
if event_bus:
await event_bus.stop()
logger.info("Extraction service shutdown complete")
@app.get("/healthz")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.get("/readyz")
async def readiness_check() -> dict[str, Any]:
"""Readiness check endpoint"""
return {
"status": "ready",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.get("/livez")
async def liveness_check() -> dict[str, Any]:
"""Liveness check endpoint"""
return {
"status": "alive",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/extract/{doc_id}", response_model=ExtractionResponse)
async def extract_fields(
doc_id: str,
request_data: ExtractionRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user()),
tenant_id: str = Depends(get_tenant_id()),
) -> ExtractionResponse:
"""Extract fields from document"""
with tracer.start_as_current_span("extract_fields") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", request_data.strategy)
try:
# Check if OCR results exist
ocr_results = (
await document_storage.get_ocr_result(tenant_id, doc_id)
if document_storage
else None
)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
# Generate extraction ID
extraction_id = str(ulid.new())
span.set_attribute("extraction_id", extraction_id)
# Start background extraction
background_tasks.add_task(
_extract_fields_async,
doc_id,
tenant_id,
ocr_results,
request_data.strategy,
extraction_id,
current_user.get("sub", "system"),
)
logger.info(
"Field extraction started", doc_id=doc_id, extraction_id=extraction_id
)
return ExtractionResponse(
extraction_id=extraction_id,
confidence=0.0, # Will be updated when processing completes
extracted_fields={},
provenance=[],
)
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start extraction", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start extraction")
@app.get("/results/{doc_id}")
async def get_extraction_results(
doc_id: str,
current_user: dict[str, Any] = Depends(get_current_user()),
tenant_id: str = Depends(get_tenant_id()),
) -> ExtractionResponse:
"""Get extraction results for document"""
with tracer.start_as_current_span("get_extraction_results") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get extraction results from storage
extraction_results = (
await document_storage.get_extraction_result(tenant_id, doc_id)
if document_storage
else None
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# pylint: disable-next=not-a-mapping
return ExtractionResponse(**extraction_results)
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get extraction results", doc_id=doc_id, error=str(e)
)
raise HTTPException(
status_code=500, detail="Failed to get extraction results"
)
async def _handle_ocr_ready(topic: str, payload: EventPayload) -> None:
"""Handle OCR completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
if not doc_id or not tenant_id:
logger.warning("Invalid OCR ready event", data=data)
return
logger.info("Auto-extracting fields from OCR results", doc_id=doc_id)
# Get OCR results
ocr_results = data.get("ocr_results")
if not ocr_results:
ocr_results = (
await document_storage.get_ocr_result(tenant_id, doc_id)
if document_storage
else None
)
if ocr_results:
await _extract_fields_async(
doc_id=doc_id,
tenant_id=tenant_id,
ocr_results=ocr_results,
strategy="hybrid",
extraction_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle OCR ready event", error=str(e))
async def _extract_fields_async(
doc_id: str,
tenant_id: str,
ocr_results: dict[str, Any],
strategy: str,
extraction_id: str,
actor: str,
) -> None:
"""Extract fields asynchronously"""
with tracer.start_as_current_span("extract_fields_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("extraction_id", extraction_id)
span.set_attribute("strategy", strategy)
try:
# Extract text from OCR results
document_text = _extract_text_from_ocr(ocr_results)
# Determine field definitions based on document type
field_definitions = _get_field_definitions(doc_id, document_text)
# Perform extraction
if strategy == "llm":
extracted_fields, confidence, provenance = await _extract_with_llm(
document_text, field_definitions, ocr_results
)
elif strategy == "rules":
extracted_fields, confidence, provenance = await _extract_with_rules(
document_text, field_definitions, ocr_results
)
elif strategy == "hybrid":
# Combine LLM and rules-based extraction
llm_fields, llm_conf, llm_prov = await _extract_with_llm(
document_text, field_definitions, ocr_results
)
rules_fields, rules_conf, rules_prov = await _extract_with_rules(
document_text, field_definitions, ocr_results
)
extracted_fields, confidence, provenance = _merge_extractions(
llm_fields, llm_conf, llm_prov, rules_fields, rules_conf, rules_prov
)
else:
raise ValueError(f"Unknown strategy: {strategy}")
# Calibrate confidence
if confidence_calibrator and confidence_calibrator.is_fitted:
calibrated_confidence = confidence_calibrator.calibrate([confidence])[0]
else:
calibrated_confidence = confidence
# Create extraction results
extraction_results = {
"doc_id": doc_id,
"extraction_id": extraction_id,
"strategy": strategy,
"extracted_at": datetime.utcnow().isoformat(),
"confidence": calibrated_confidence,
"raw_confidence": confidence,
"extracted_fields": extracted_fields,
"provenance": provenance,
"field_count": len(extracted_fields),
}
# Store results
if document_storage:
await document_storage.store_extraction_result(
tenant_id, doc_id, extraction_results
)
# Update metrics
metrics.counter("extractions_completed_total").labels(
tenant_id=tenant_id, strategy=strategy
).inc()
metrics.histogram("extraction_confidence").labels(
strategy=strategy
).observe(calibrated_confidence)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"extraction_id": extraction_id,
"strategy": strategy,
"confidence": calibrated_confidence,
"field_count": len(extracted_fields),
"extraction_results": extraction_results,
},
actor=actor,
tenant_id=tenant_id,
)
if event_bus:
await event_bus.publish(EventTopics.DOC_EXTRACTED, event_payload)
logger.info(
"Field extraction completed",
doc_id=doc_id,
fields=len(extracted_fields),
confidence=calibrated_confidence,
)
except Exception as e:
logger.error("Field extraction failed", doc_id=doc_id, error=str(e))
# Update error metrics
metrics.counter("extraction_errors_total").labels(
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
).inc()
def _extract_text_from_ocr(ocr_results: dict[str, Any]) -> str:
"""Extract text from OCR results"""
text_parts = []
for page in ocr_results.get("pages", []):
if "text" in page:
text_parts.append(page["text"])
elif "tesseract" in page and "text" in page["tesseract"]:
text_parts.append(page["tesseract"]["text"])
return "\n\n".join(text_parts)
def _get_field_definitions(doc_id: str, document_text: str) -> dict[str, str]:
"""Get field definitions based on document type"""
# Analyze document text to determine type
text_lower = document_text.lower()
if "invoice" in text_lower or "bill" in text_lower:
return {
"invoice_number": "Invoice or bill number",
"date": "Invoice date",
"supplier_name": "Supplier or vendor name",
"total_amount": "Total amount including VAT",
"net_amount": "Net amount excluding VAT",
"vat_amount": "VAT amount",
"description": "Description of goods or services",
}
elif "bank statement" in text_lower or "account statement" in text_lower:
return {
"account_number": "Bank account number",
"sort_code": "Bank sort code",
"statement_period": "Statement period",
"opening_balance": "Opening balance",
"closing_balance": "Closing balance",
"transactions": "List of transactions",
}
elif "receipt" in text_lower:
return {
"merchant_name": "Merchant or store name",
"date": "Receipt date",
"total_amount": "Total amount paid",
"payment_method": "Payment method used",
"items": "List of items purchased",
}
else:
# Generic fields
return {
"date": "Any dates mentioned",
"amount": "Any monetary amounts",
"names": "Any person or company names",
"addresses": "Any addresses",
"reference_numbers": "Any reference or account numbers",
}
async def _extract_with_llm(
document_text: str, field_definitions: dict[str, str], ocr_results: dict[str, Any]
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
"""Extract fields using LLM"""
try:
# This would integrate with OpenAI API
# For now, return mock extraction
logger.warning("LLM extraction not implemented, using mock data")
extracted_fields = {}
provenance = []
# Mock extraction based on field definitions
for field_name, _field_desc in field_definitions.items():
if "amount" in field_name.lower():
extracted_fields[field_name] = "£1,234.56"
elif "date" in field_name.lower():
extracted_fields[field_name] = "2024-01-15"
elif "name" in field_name.lower():
extracted_fields[field_name] = "Example Company Ltd"
else:
extracted_fields[field_name] = f"Mock {field_name}"
# Add provenance
provenance.append(
{
"field": field_name,
"value": extracted_fields[field_name],
"confidence": 0.8,
"source": "llm",
"page": 1,
"bbox": [100, 100, 200, 120],
}
)
return extracted_fields, 0.8, provenance
except Exception as e:
logger.error("LLM extraction failed", error=str(e))
return {}, 0.0, []
async def _extract_with_rules(
document_text: str, field_definitions: dict[str, str], ocr_results: dict[str, Any]
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
"""Extract fields using rules-based approach"""
import re
extracted_fields = {}
provenance = []
# Define extraction patterns
patterns = {
"amount": r"£\d{1,3}(?:,\d{3})*(?:\.\d{2})?",
"date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
"invoice_number": r"(?:invoice|inv|bill)\s*#?\s*(\w+)",
"account_number": r"\b\d{8}\b",
"sort_code": r"\b\d{2}-\d{2}-\d{2}\b",
}
for field_name, _field_desc in field_definitions.items():
# Find matching pattern
pattern_key = None
for key in patterns:
if key in field_name.lower():
pattern_key = key
break
if pattern_key:
pattern = patterns[pattern_key]
matches = re.finditer(pattern, document_text, re.IGNORECASE)
for match in matches:
value = match.group(1) if match.groups() else match.group(0)
extracted_fields[field_name] = value
provenance.append(
{
"field": field_name,
"value": value,
"confidence": 0.9,
"source": "rules",
"pattern": pattern,
"match_start": match.start(),
"match_end": match.end(),
}
)
break # Take first match
confidence = 0.9 if extracted_fields else 0.0
return extracted_fields, confidence, provenance
def _merge_extractions(
llm_fields: dict[str, Any],
llm_conf: float,
llm_prov: list[dict[str, Any]],
rules_fields: dict[str, Any],
rules_conf: float,
rules_prov: list[dict[str, Any]],
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
"""Merge LLM and rules-based extractions"""
merged_fields = {}
merged_provenance = []
# Get all field names
all_fields = set(llm_fields.keys()) | set(rules_fields.keys())
for field in all_fields:
llm_value = llm_fields.get(field)
rules_value = rules_fields.get(field)
# Prefer rules-based extraction for structured fields
if rules_value and field in ["amount", "date", "account_number", "sort_code"]:
merged_fields[field] = rules_value
# Find provenance for this field
for prov in rules_prov:
if prov["field"] == field:
merged_provenance.append(prov)
break
elif llm_value:
merged_fields[field] = llm_value
# Find provenance for this field
for prov in llm_prov:
if prov["field"] == field:
merged_provenance.append(prov)
break
# Calculate combined confidence
combined_confidence = (llm_conf + rules_conf) / 2
return merged_fields, combined_confidence, merged_provenance
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8003, reload=True, log_config=None)


@@ -0,0 +1,17 @@
# Service-specific dependencies for svc_extract
# LLM integration
openai>=1.3.0
anthropic>=0.7.0
# JSON schema validation
jsonschema>=4.20.0
# Template processing
jinja2>=3.1.0
# Text similarity (lightweight)
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0
# Data validation
cerberus>=1.3.4
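
The fuzzywuzzy / python-Levenshtein pair above is the usual lightweight choice for matching noisy OCR labels to known field names; a tiny sketch (the candidate list is illustrative, not the service's actual schema):

from fuzzywuzzy import process

KNOWN_FIELDS = ["invoice_number", "total_amount", "vat_amount", "supplier_name"]

label = "Tot al Amnount (inc VAT)"  # typical OCR noise
best_match, score = process.extractOne(label, KNOWN_FIELDS)
print(best_match, score)  # likely "total_amount" with a high similarity score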


@@ -0,0 +1,53 @@
# Multi-stage build for svc_firm_connectors
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_firm_connectors/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_firm_connectors/ ./apps/svc_firm_connectors/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_firm_connectors.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -0,0 +1,762 @@
# FILE: apps/svc_firm_connectors/main.py
# mypy: disable-error-code=union-attr
# Firm database integration with practice management systems
import asyncio
import json
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_neo4j_client,
create_vault_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, FirmSyncRequest, FirmSyncResponse
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
logger = structlog.get_logger()
class FirmConnectorsSettings(BaseAppSettings):
"""Settings for firm connectors service"""
service_name: str = "svc-firm-connectors"
# Supported practice management systems
supported_systems: list[str] = [
"iris",
"sage",
"xero",
"quickbooks",
"freeagent",
"kashflow",
]
# Sync configuration
sync_batch_size: int = 100
max_sync_retries: int = 3
sync_timeout: int = 300 # 5 minutes
# Rate limiting
api_rate_limit: int = 100 # requests per minute
# Data mapping
field_mappings_dir: str = "config/firm_mappings"
# Create app and settings
app, settings = create_app(
service_name="svc-firm-connectors",
title="Tax Agent Firm Connectors Service",
description="Practice management system integration",
settings_class=FirmConnectorsSettings,
)
# Global clients
vault_helper: VaultTransitHelper | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-firm-connectors")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global vault_helper, neo4j_client, event_bus
logger.info("Starting firm connectors service")
# Setup observability
setup_observability(settings)
# Initialize Vault helper
vault_client = create_vault_client(settings)
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info("Firm connectors service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down firm connectors service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Firm connectors service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"supported_systems": settings.supported_systems,
}
@app.post("/sync", response_model=FirmSyncResponse)
async def sync_firm_data(
request_data: FirmSyncRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> FirmSyncResponse:
"""Sync data from practice management system"""
with tracer.start_as_current_span("sync_firm_data") as span:
span.set_attribute("system", request_data.system)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("sync_type", request_data.sync_type)
try:
# Validate system
if request_data.system not in settings.supported_systems:
raise HTTPException(
status_code=400, detail=f"Unsupported system: {request_data.system}"
)
# Generate sync ID
sync_id = str(ulid.new())
span.set_attribute("sync_id", sync_id)
# Start background sync
background_tasks.add_task(
_sync_firm_data_async,
request_data.system,
request_data.sync_type,
request_data.connection_config,
tenant_id,
sync_id,
current_user.get("sub", "system"),
)
logger.info(
"Firm data sync started",
sync_id=sync_id,
system=request_data.system,
sync_type=request_data.sync_type,
)
return FirmSyncResponse(
firm_id=request_data.firm_id,
status="syncing",
message=f"Sync started with ID: {sync_id}",
synced_entities=0,
errors=[],
)
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start firm sync", error=str(e))
raise HTTPException(status_code=500, detail="Failed to start firm sync")
@app.get("/sync/{sync_id}")
async def get_sync_status(
sync_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get sync status"""
with tracer.start_as_current_span("get_sync_status") as span:
span.set_attribute("sync_id", sync_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get sync record from Neo4j
query = """
MATCH (s:FirmSync {sync_id: $sync_id, tenant_id: $tenant_id})
WHERE s.retracted_at IS NULL
RETURN s
"""
results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
query, {"sync_id": sync_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Sync not found")
sync_record = results[0]["s"]
return {
"sync_id": sync_id,
"system": sync_record.get("system"),
"status": sync_record.get("status"),
"records_synced": sync_record.get("records_synced", 0),
"total_records": sync_record.get("total_records", 0),
"started_at": sync_record.get("started_at"),
"completed_at": sync_record.get("completed_at"),
"errors": json.loads(sync_record.get("errors", "[]")),
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get sync status", sync_id=sync_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get sync status")
@app.post("/connections/{system}/test")
async def test_connection(
system: str,
connection_config: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Test connection to practice management system"""
with tracer.start_as_current_span("test_connection") as span:
span.set_attribute("system", system)
span.set_attribute("tenant_id", tenant_id)
try:
# Validate system
if system not in settings.supported_systems:
raise HTTPException(
status_code=400, detail=f"Unsupported system: {system}"
)
# Test connection based on system
if system == "iris":
result = await _test_iris_connection(connection_config)
elif system == "sage":
result = await _test_sage_connection(connection_config)
elif system == "xero":
result = await _test_xero_connection(connection_config)
elif system == "quickbooks":
result = await _test_quickbooks_connection(connection_config)
elif system == "freeagent":
result = await _test_freeagent_connection(connection_config)
elif system == "kashflow":
result = await _test_kashflow_connection(connection_config)
else:
raise HTTPException(
status_code=400,
detail=f"Connection test not implemented for {system}",
)
return {
"system": system,
"connection_status": result["status"],
"message": result["message"],
"capabilities": result.get("capabilities", []),
"test_timestamp": datetime.utcnow().isoformat(),
}
except HTTPException:
raise
except Exception as e:
logger.error("Connection test failed", system=system, error=str(e))
raise HTTPException(
status_code=500, detail=f"Connection test failed: {str(e)}"
)
@app.get("/systems")
async def list_supported_systems(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""List supported practice management systems"""
try:
systems_info: list[Any] = []
for system in settings.supported_systems:
system_info = {
"system": system,
"name": _get_system_name(system),
"capabilities": _get_system_capabilities(system),
"connection_fields": _get_connection_fields(system),
}
systems_info.append(system_info)
return {"supported_systems": systems_info, "total_systems": len(systems_info)}
except Exception as e:
logger.error("Failed to list systems", error=str(e))
raise HTTPException(status_code=500, detail="Failed to list systems")
async def _sync_firm_data_async(
system: str,
sync_type: str,
connection_config: dict[str, Any],
tenant_id: str,
sync_id: str,
actor: str,
) -> None:
"""Sync firm data asynchronously"""
with tracer.start_as_current_span("sync_firm_data_async") as span:
span.set_attribute("sync_id", sync_id)
span.set_attribute("system", system)
span.set_attribute("sync_type", sync_type)
try:
# Create sync record
await _create_sync_record(sync_id, system, sync_type, tenant_id)
# Perform sync based on system
if system == "iris":
sync_result = await _sync_iris_data(
connection_config, sync_type, tenant_id
)
elif system == "sage":
sync_result = await _sync_sage_data(
connection_config, sync_type, tenant_id
)
elif system == "xero":
sync_result = await _sync_xero_data(
connection_config, sync_type, tenant_id
)
elif system == "quickbooks":
sync_result = await _sync_quickbooks_data(
connection_config, sync_type, tenant_id
)
elif system == "freeagent":
sync_result = await _sync_freeagent_data(
connection_config, sync_type, tenant_id
)
elif system == "kashflow":
sync_result = await _sync_kashflow_data(
connection_config, sync_type, tenant_id
)
else:
raise Exception(f"Sync not implemented for {system}")
# Update sync record
await _update_sync_record(sync_id, "completed", sync_result)
# Update metrics
metrics.counter("firm_syncs_completed_total").labels(
tenant_id=tenant_id, system=system, sync_type=sync_type
).inc()
metrics.histogram("sync_records_count").labels(
system=system, sync_type=sync_type
).observe(sync_result["records_synced"])
# Publish completion event
event_payload = EventPayload(
data={
"sync_id": sync_id,
"system": system,
"sync_type": sync_type,
"tenant_id": tenant_id,
"records_synced": sync_result["records_synced"],
"entities_created": sync_result.get("entities_created", 0),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.FIRM_SYNC_COMPLETED, event_payload) # type: ignore
logger.info(
"Firm sync completed",
sync_id=sync_id,
system=system,
records=sync_result["records_synced"],
)
except Exception as e:
logger.error("Firm sync failed", sync_id=sync_id, error=str(e))
# Update sync record with error
await _update_sync_record(sync_id, "error", {"error": str(e)})
# Update error metrics
metrics.counter("firm_sync_errors_total").labels(
tenant_id=tenant_id, system=system, error_type=type(e).__name__
).inc()
async def _test_iris_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test IRIS connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["clients", "jobs", "documents"],
}
async def _test_sage_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test Sage connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["customers", "suppliers", "transactions"],
}
async def _test_xero_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test Xero connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["contacts", "invoices", "bank_transactions"],
}
async def _test_quickbooks_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test QuickBooks connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["customers", "vendors", "items", "transactions"],
}
async def _test_freeagent_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test FreeAgent connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["contacts", "projects", "invoices", "expenses"],
}
async def _test_kashflow_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test KashFlow connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["customers", "suppliers", "invoices", "receipts"],
}
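# The connection tests above are canned mocks. A hedged sketch of what a real IRIS
# test might look like, assuming an httpx dependency and a hypothetical /api/v1/ping
# endpoint (both are assumptions, not the actual IRIS API):
#
# import httpx
#
# async def _test_iris_connection(config: dict[str, Any]) -> dict[str, Any]:
#     async with httpx.AsyncClient(base_url=config["base_url"], timeout=10) as client:
#         response = await client.get(
#             "/api/v1/ping",
#             headers={"Authorization": f"Bearer {config['api_key']}"},
#         )
#         response.raise_for_status()
#         return {
#             "status": "success",
#             "message": "Connection successful",
#             "capabilities": ["clients", "jobs", "documents"],
#         }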
async def _sync_iris_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from IRIS"""
# Mock implementation
await asyncio.sleep(2)
# Simulate syncing client data
mock_clients = [
{"id": "client_1", "name": "John Doe", "utr": "1234567890"},
{"id": "client_2", "name": "Jane Smith", "utr": "0987654321"},
]
entities_created = 0
for client in mock_clients:
# Create taxpayer profile in KG
taxpayer_properties = {
"taxpayer_id": client["id"],
"name": client["name"],
"utr": client["utr"],
"tenant_id": tenant_id,
"source": "iris_sync",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("TaxpayerProfile", taxpayer_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
entities_created += 1
return {
"records_synced": len(mock_clients),
"entities_created": entities_created,
"sync_type": sync_type,
}
async def _sync_sage_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from Sage"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 5, "entities_created": 5, "sync_type": sync_type}
async def _sync_xero_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from Xero"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 8, "entities_created": 8, "sync_type": sync_type}
async def _sync_quickbooks_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from QuickBooks"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 12, "entities_created": 12, "sync_type": sync_type}
async def _sync_freeagent_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from FreeAgent"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 6, "entities_created": 6, "sync_type": sync_type}
async def _sync_kashflow_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from KashFlow"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 4, "entities_created": 4, "sync_type": sync_type}
def _get_system_name(system: str) -> str:
"""Get human-readable system name"""
names = {
"iris": "IRIS Practice Management",
"sage": "Sage Practice Management",
"xero": "Xero",
"quickbooks": "QuickBooks",
"freeagent": "FreeAgent",
"kashflow": "KashFlow",
}
return names.get(system, system.title())
def _get_system_capabilities(system: str) -> list[str]:
"""Get system capabilities"""
capabilities = {
"iris": ["clients", "jobs", "documents", "time_tracking"],
"sage": ["customers", "suppliers", "transactions", "reports"],
"xero": ["contacts", "invoices", "bank_transactions", "reports"],
"quickbooks": ["customers", "vendors", "items", "transactions", "reports"],
"freeagent": ["contacts", "projects", "invoices", "expenses", "time_tracking"],
"kashflow": ["customers", "suppliers", "invoices", "receipts", "reports"],
}
return capabilities.get(system, [])
def _get_connection_fields(system: str) -> list[dict[str, Any]]:
"""Get required connection fields for system"""
fields = {
"iris": [
{
"name": "api_key",
"type": "string",
"required": True,
"description": "IRIS API Key",
},
{
"name": "base_url",
"type": "string",
"required": True,
"description": "IRIS Base URL",
},
],
"sage": [
{
"name": "username",
"type": "string",
"required": True,
"description": "Sage Username",
},
{
"name": "password",
"type": "password",
"required": True,
"description": "Sage Password",
},
{
"name": "database",
"type": "string",
"required": True,
"description": "Database Name",
},
],
"xero": [
{
"name": "client_id",
"type": "string",
"required": True,
"description": "Xero Client ID",
},
{
"name": "client_secret",
"type": "password",
"required": True,
"description": "Xero Client Secret",
},
{
"name": "tenant_id",
"type": "string",
"required": True,
"description": "Xero Tenant ID",
},
],
"quickbooks": [
{
"name": "client_id",
"type": "string",
"required": True,
"description": "QuickBooks Client ID",
},
{
"name": "client_secret",
"type": "password",
"required": True,
"description": "QuickBooks Client Secret",
},
{
"name": "company_id",
"type": "string",
"required": True,
"description": "Company ID",
},
],
"freeagent": [
{
"name": "client_id",
"type": "string",
"required": True,
"description": "FreeAgent Client ID",
},
{
"name": "client_secret",
"type": "password",
"required": True,
"description": "FreeAgent Client Secret",
},
],
"kashflow": [
{
"name": "username",
"type": "string",
"required": True,
"description": "KashFlow Username",
},
{
"name": "password",
"type": "password",
"required": True,
"description": "KashFlow Password",
},
],
}
return fields.get(system, [])
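# Based on the field definitions above, a Xero connection_config accepted by the
# test_connection endpoint would look like this (placeholder values only):
#
# {
#     "client_id": "your-xero-client-id",
#     "client_secret": "your-xero-client-secret",
#     "tenant_id": "your-xero-tenant-id",
# }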
async def _create_sync_record(
sync_id: str, system: str, sync_type: str, tenant_id: str
) -> None:
"""Create sync record in knowledge graph"""
sync_properties = {
"sync_id": sync_id,
"system": system,
"sync_type": sync_type,
"tenant_id": tenant_id,
"status": "running",
"started_at": datetime.utcnow().isoformat(),
"records_synced": 0,
"errors": "[]",
"source": "firm_connectors",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("FirmSync", sync_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
async def _update_sync_record(
sync_id: str, status: str, result: dict[str, Any]
) -> None:
"""Update sync record with results"""
update_properties = {
"status": status,
"completed_at": datetime.utcnow().isoformat(),
"records_synced": result.get("records_synced", 0),
"total_records": result.get("total_records", 0),
"errors": json.dumps(result.get("errors", [])),
}
# This would update the existing node
# For now, just log
logger.debug(
"Sync record updated",
sync_id=sync_id,
status=status,
properties=update_properties,
)
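# A minimal sketch of the intended update, assuming Neo4jClient.run_query accepts
# parameterized Cypher as used elsewhere in this service (the SET query itself is
# an assumption, not an existing helper):
#
# update_query = """
# MATCH (s:FirmSync {sync_id: $sync_id})
# WHERE s.retracted_at IS NULL
# SET s += $props
# RETURN s
# """
# await neo4j_client.run_query(
#     update_query, {"sync_id": sync_id, "props": update_properties}
# )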
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8011, reload=True, log_config=None)

45
apps/svc_firm_connectors/requirements.txt Normal file
View File

@@ -0,0 +1,45 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Database connectors
sqlalchemy>=2.0.0
pymssql>=2.2.0
cx-Oracle>=8.3.0
# API clients for practice management systems
zeep>=4.2.0 # SOAP client
xmltodict>=0.13.0
# OAuth for various systems
authlib>=1.2.0
requests-oauthlib>=1.3.0
# Data synchronization
pandas>=2.1.0
# Rate limiting
ratelimit>=2.2.0
# Retry mechanisms
tenacity>=8.2.0
# CSV processing
csvkit>=1.1.0
# Excel file processing
openpyxl>=3.1.0
xlrd>=2.0.0
# Data validation
marshmallow>=3.20.0
cerberus>=1.3.4
# Connection pooling (built into SQLAlchemy)
# sqlalchemy-pool>=1.3.0 # Package doesn't exist, pooling is built into SQLAlchemy
# Additional utilities
python-dateutil>=2.8.0
pytz>=2023.3

53
apps/svc_forms/Dockerfile Normal file
View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_forms
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_forms/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_forms/ ./apps/svc_forms/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_forms.main:app", "--host", "0.0.0.0", "--port", "8000"]

625
apps/svc_forms/main.py Normal file
View File

@@ -0,0 +1,625 @@
"""PDF form filling with evidence pack generation."""
# FILE: apps/svc-forms/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from io import BytesIO
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse, Response
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_minio_client,
create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.forms import UK_TAX_FORMS, EvidencePackGenerator, PDFFormFiller
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class FormsSettings(BaseAppSettings):
"""Settings for forms service"""
service_name: str = "svc-forms"
# Form templates
forms_template_dir: str = "forms/templates"
output_bucket: str = "filled-forms"
evidence_packs_bucket: str = "evidence-packs"
# Supported forms
supported_forms: list[str] = ["SA100", "SA103", "SA105", "SA106"]
# PDF configuration
pdf_quality: str = "high"
flatten_forms: bool = True
# Create app and settings
app, settings = create_app(
service_name="svc-forms",
title="Tax Agent Forms Service",
description="PDF form filling and evidence pack generation",
settings_class=FormsSettings,
)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
pdf_form_filler: PDFFormFiller | None = None
evidence_pack_generator: EvidencePackGenerator | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-forms")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, pdf_form_filler # pylint: disable=line-too-long
global evidence_pack_generator, event_bus
logger.info("Starting forms service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize PDF form filler
pdf_form_filler = PDFFormFiller()
# Load form templates
for form_id in settings.supported_forms:
template_path = os.path.join(settings.forms_template_dir, f"{form_id}.pdf")
if os.path.exists(template_path):
pdf_form_filler.load_template(form_id, template_path)
else:
logger.warning(
"Form template not found", form_id=form_id, path=template_path
)
# Initialize evidence pack generator
evidence_pack_generator = EvidencePackGenerator(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Subscribe to calculation completion events
await event_bus.subscribe( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
EventTopics.CALC_SCHEDULE_READY, _handle_calculation_ready
)
# Ensure buckets exist
await storage_client.ensure_bucket(settings.output_bucket)
await storage_client.ensure_bucket(settings.evidence_packs_bucket)
logger.info("Forms service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down forms service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Forms service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": "1.0.0",
"timestamp": datetime.now().isoformat(),
"supported_forms": settings.supported_forms,
}
@app.post("/fill/{form_id}")
async def fill_form(
form_id: str,
field_values: dict[str, Any],
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Fill PDF form with provided values"""
with tracer.start_as_current_span("fill_form") as span:
span.set_attribute("form_id", form_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("field_count", len(field_values))
try:
# Validate form ID
if form_id not in settings.supported_forms:
raise HTTPException(
status_code=400, detail=f"Unsupported form: {form_id}"
)
# Generate filling ID
filling_id = str(ulid.new())
span.set_attribute("filling_id", filling_id)
# Start background form filling
background_tasks.add_task(
_fill_form_async,
form_id,
field_values,
tenant_id,
filling_id,
current_user.get("sub", "system"),
)
logger.info("Form filling started", form_id=form_id, filling_id=filling_id)
return {
"filling_id": filling_id,
"form_id": form_id,
"status": "filling",
"field_count": len(field_values),
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start form filling", form_id=form_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start form filling")
@app.post("/fill-from-calculation/{calculation_id}")
async def fill_form_from_calculation(
calculation_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Fill form using calculation results"""
with tracer.start_as_current_span("fill_form_from_calculation") as span:
span.set_attribute("calculation_id", calculation_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get calculation from Neo4j
calc_query = """
MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})
WHERE c.retracted_at IS NULL
RETURN c
"""
calc_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
calc_query, {"calculation_id": calculation_id, "tenant_id": tenant_id}
)
if not calc_results:
raise HTTPException(status_code=404, detail="Calculation not found")
calculation = calc_results[0]["c"]
form_id = calculation.get("schedule")
if not form_id:
raise HTTPException(
status_code=400, detail="No schedule found in calculation"
)
# Get form boxes
boxes_query = """
MATCH (c:Calculation {calculation_id: $calculation_id})-[:HAS_BOX]->(b:FormBox)
WHERE c.retracted_at IS NULL AND b.retracted_at IS NULL
RETURN b
"""
box_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
boxes_query, {"calculation_id": calculation_id}
)
# Convert form boxes to field values
field_values = {}
for box_result in box_results:
box = box_result["b"]
field_values[f"box_{box['box']}"] = box["value"]
# Generate filling ID
filling_id = str(ulid.new())
span.set_attribute("filling_id", filling_id)
span.set_attribute("form_id", form_id)
# Start background form filling
background_tasks.add_task(
_fill_form_async,
form_id,
field_values,
tenant_id,
filling_id,
current_user.get("sub", "system"),
calculation_id,
)
logger.info(
"Form filling from calculation started",
form_id=form_id,
filling_id=filling_id,
calculation_id=calculation_id,
)
return {
"filling_id": filling_id,
"form_id": form_id,
"calculation_id": calculation_id,
"status": "filling",
"field_count": len(field_values),
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to fill form from calculation",
calculation_id=calculation_id,
error=str(e),
)
raise HTTPException(
status_code=500, detail="Failed to fill form from calculation"
)
@app.get("/download/{filling_id}")
async def download_filled_form(
filling_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> Response:
"""Download filled form"""
with tracer.start_as_current_span("download_filled_form") as span:
span.set_attribute("filling_id", filling_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get filled form from storage
object_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
form_content = await storage_client.get_object( # pyright: ignore[reportOptionalMemberAccess]
settings.output_bucket, object_key
)
if not form_content:
raise HTTPException(status_code=404, detail="Filled form not found")
return Response(
content=form_content,
media_type="application/pdf",
headers={
"Content-Disposition": f"attachment; filename={filling_id}.pdf"
},
)
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to download filled form", filling_id=filling_id, error=str(e)
)
raise HTTPException(
status_code=500, detail="Failed to download filled form"
)
@app.post("/evidence-pack")
async def create_evidence_pack(
taxpayer_id: str,
tax_year: str,
scope: str,
evidence_items: list[dict[str, Any]],
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create evidence pack with supporting documents"""
with tracer.start_as_current_span("create_evidence_pack") as span:
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("tax_year", tax_year)
span.set_attribute("scope", scope)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("evidence_count", len(evidence_items))
try:
# Generate pack ID
pack_id = str(ulid.new())
span.set_attribute("pack_id", pack_id)
# Start background pack creation
background_tasks.add_task(
_create_evidence_pack_async,
taxpayer_id,
tax_year,
scope,
evidence_items,
tenant_id,
pack_id,
current_user.get("sub", "system"),
)
logger.info(
"Evidence pack creation started",
pack_id=pack_id,
taxpayer_id=taxpayer_id,
scope=scope,
)
return {
"pack_id": pack_id,
"taxpayer_id": taxpayer_id,
"tax_year": tax_year,
"scope": scope,
"status": "creating",
"evidence_count": len(evidence_items),
}
except Exception as e:
logger.error("Failed to start evidence pack creation", error=str(e))
raise HTTPException(
status_code=500, detail="Failed to start evidence pack creation"
)
@app.get("/forms")
async def list_supported_forms(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""List supported forms with field information"""
try:
forms_info = []
for form_id in settings.supported_forms:
form_config = UK_TAX_FORMS.get(form_id, {})
# Get form fields if template is loaded
fields = []
if pdf_form_filler and form_id in pdf_form_filler.form_templates:
fields = pdf_form_filler.get_form_fields(form_id)
forms_info.append(
{
"form_id": form_id,
"name": form_config.get("name", form_id),
"template_available": form_id
in (pdf_form_filler.form_templates if pdf_form_filler else {}),
"field_count": len(fields),
"fields": fields[:10], # Limit to first 10 fields for overview
}
)
return {"supported_forms": forms_info, "total_forms": len(forms_info)}
except Exception as e:
logger.error("Failed to list forms", error=str(e))
raise HTTPException(status_code=500, detail="Failed to list forms")
async def _handle_calculation_ready(topic: str, payload: EventPayload) -> None:
"""Handle calculation completion events for auto-form filling"""
try:
data = payload.data
calculation_id = data.get("calculation_id")
schedule = data.get("schedule")
tenant_id = data.get("tenant_id")
if not calculation_id or not schedule or not tenant_id:
logger.warning("Invalid calculation ready event", data=data)
return
logger.info(
"Auto-filling form from calculation",
calculation_id=calculation_id,
schedule=schedule,
)
# Get form boxes from event data
form_boxes = data.get("form_boxes", {})
# Convert to field values
field_values = {}
for box_id, box_data in form_boxes.items():
field_values[f"box_{box_id}"] = box_data.get("value")
await _fill_form_async(
form_id=schedule,
field_values=field_values,
tenant_id=tenant_id,
filling_id=str(ulid.new()),
actor=payload.actor,
calculation_id=calculation_id,
)
except Exception as e:
logger.error("Failed to handle calculation ready event", error=str(e))
async def _fill_form_async(
form_id: str,
field_values: dict[str, Any],
tenant_id: str,
filling_id: str,
actor: str,
calculation_id: str | None = None,
) -> None:
"""Fill form asynchronously"""
with tracer.start_as_current_span("fill_form_async") as span:
span.set_attribute("form_id", form_id)
span.set_attribute("filling_id", filling_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Fill the form
filled_pdf = pdf_form_filler.fill_form(form_id, field_values) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
if not filled_pdf:
# pylint: disable-next=broad-exception-raised
raise Exception("Form filling failed")
# Store filled form
object_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
success = await storage_client.put_object( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
bucket_name=settings.output_bucket,
object_name=object_key,
data=BytesIO(filled_pdf),
length=len(filled_pdf),
content_type="application/pdf",
metadata={
"form_id": form_id,
"filling_id": filling_id,
"tenant_id": tenant_id,
"calculation_id": calculation_id or "",
"filled_at": datetime.utcnow().isoformat(),
},
)
if not success:
# pylint: disable-next=broad-exception-raised
raise Exception("Failed to store filled form")
# Update metrics
metrics.counter("forms_filled_total").labels(
tenant_id=tenant_id, form_id=form_id
).inc()
# Publish completion event
event_payload = EventPayload(
data={
"filling_id": filling_id,
"form_id": form_id,
"tenant_id": tenant_id,
"calculation_id": calculation_id,
"s3_url": f"s3://{settings.output_bucket}/{object_key}",
"field_count": len(field_values),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.FORM_FILLED, event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info(
"Form filling completed", filling_id=filling_id, form_id=form_id
)
except Exception as e:
logger.error("Form filling failed", filling_id=filling_id, error=str(e))
# Update error metrics
metrics.counter("form_filling_errors_total").labels(
tenant_id=tenant_id, form_id=form_id, error_type=type(e).__name__
).inc()
async def _create_evidence_pack_async(
taxpayer_id: str,
tax_year: str,
scope: str,
evidence_items: list[dict[str, Any]],
tenant_id: str,
pack_id: str,
actor: str,
) -> None:
"""Create evidence pack asynchronously"""
with tracer.start_as_current_span("create_evidence_pack_async") as span:
span.set_attribute("pack_id", pack_id)
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("scope", scope)
try:
# Create evidence pack
pack_result = await evidence_pack_generator.create_evidence_pack( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
taxpayer_id=taxpayer_id,
tax_year=tax_year,
scope=scope,
evidence_items=evidence_items,
)
# Update metrics
metrics.counter("evidence_packs_created_total").labels(
tenant_id=tenant_id, scope=scope
).inc()
logger.info(
"Evidence pack created",
pack_id=pack_id,
pack_size=pack_result["pack_size"],
evidence_count=pack_result["evidence_count"],
)
except Exception as e:
logger.error("Evidence pack creation failed", pack_id=pack_id, error=str(e))
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8009, reload=True, log_config=None)

37
apps/svc_forms/requirements.txt Normal file
View File

@@ -0,0 +1,37 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# PDF form filling
pdfrw>=0.4
reportlab>=4.0.0
# PDF processing
PyPDF2>=3.0.0
pypdf>=3.17.0
# Image processing for overlays
Pillow>=10.1.0
# ZIP file creation for evidence packs (zipfile is in the standard library)
# zipfile36>=0.1.3  # Backport of the Python 3.6 zipfile module; not needed on Python 3.12
# Template processing
jinja2>=3.1.0
# QR code generation
qrcode>=7.4.0
# Barcode generation
python-barcode>=0.15.0
# Font handling
fonttools>=4.44.0
# Additional PDF utilities
pdfminer.six>=20231228
# Document conversion
python-docx>=1.1.0

54
apps/svc_hmrc/Dockerfile Normal file
View File

@@ -0,0 +1,54 @@
# Multi-stage build for svc_hmrc
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_hmrc/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_hmrc/ ./apps/svc_hmrc/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_hmrc.main:app", "--host", "0.0.0.0", "--port", "8000"]

759
apps/svc_hmrc/main.py Normal file
View File

@@ -0,0 +1,759 @@
# FILE: apps/svc-hmrc/main.py
# HMRC submission service with MTD API integration and validation
import asyncio
import json
import os
import urllib.parse
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_neo4j_client,
create_vault_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, HMRCSubmissionRequest, HMRCSubmissionResponse
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
logger = structlog.get_logger()
class HMRCSettings(BaseAppSettings):
"""Settings for HMRC service"""
service_name: str = "svc-hmrc"
# HMRC API configuration
hmrc_base_url: str = "https://api.service.hmrc.gov.uk"
hmrc_sandbox_url: str = "https://test-api.service.hmrc.gov.uk"
use_sandbox: bool = True
# OAuth configuration
client_id: str = ""
client_secret: str = ""
redirect_uri: str = "http://localhost:8000/oauth/callback"
# API endpoints
mtd_income_tax_endpoint: str = (
"/income-tax/self-assessment/ni/{nino}/uk-property/{taxYear}"
)
mtd_self_employment_endpoint: str = (
"/income-tax/self-assessment/ni/{nino}/self-employment/{businessId}"
)
# Validation
max_submission_retries: int = 3
submission_timeout: int = 300 # 5 minutes
# Create app and settings
app, settings = create_app(
service_name="svc-hmrc",
title="Tax Agent HMRC Service",
description="HMRC submission service with MTD API integration",
settings_class=HMRCSettings,
)
# Global clients
vault_helper: VaultTransitHelper | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-hmrc")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global vault_helper, neo4j_client, event_bus
logger.info("Starting HMRC service")
# Setup observability
setup_observability(settings)
# Initialize Vault helper
vault_client = create_vault_client(settings)
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise Exception("Event bus not initialized")
await event_bus.start()
# Subscribe to form completion events
await event_bus.subscribe(EventTopics.FORM_FILLED, _handle_form_filled) # type: ignore
logger.info("HMRC service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down HMRC service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("HMRC service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"hmrc_environment": "sandbox" if settings.use_sandbox else "production",
}
@app.post("/submit", response_model=HMRCSubmissionResponse)
async def submit_to_hmrc(
request_data: HMRCSubmissionRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> HMRCSubmissionResponse:
"""Submit tax return to HMRC"""
with tracer.start_as_current_span("submit_to_hmrc") as span:
span.set_attribute("tax_year", request_data.tax_year)
span.set_attribute("taxpayer_id", request_data.taxpayer_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("dry_run", request_data.dry_run)
try:
# Generate submission ID
submission_id = str(ulid.new())
span.set_attribute("submission_id", submission_id)
# Start background submission
background_tasks.add_task(
_submit_to_hmrc_async,
request_data.tax_year,
request_data.taxpayer_id,
request_data.dry_run,
tenant_id,
submission_id,
current_user.get("sub", "system"),
)
logger.info(
"HMRC submission started",
submission_id=submission_id,
taxpayer_id=request_data.taxpayer_id,
dry_run=request_data.dry_run,
)
return HMRCSubmissionResponse(
submission_id=submission_id,
status="processing",
hmrc_reference=None,
submission_timestamp=datetime.utcnow(),
validation_results={},
dry_run=request_data.dry_run,
)
except Exception as e:
logger.error("Failed to start HMRC submission", error=str(e))
raise HTTPException(
status_code=500, detail="Failed to start HMRC submission"
)
@app.get("/submissions/{submission_id}")
async def get_submission_status(
submission_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get submission status"""
with tracer.start_as_current_span("get_submission_status") as span:
span.set_attribute("submission_id", submission_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get submission from Neo4j
query = """
MATCH (s:Submission {submission_id: $submission_id, tenant_id: $tenant_id})
WHERE s.retracted_at IS NULL
RETURN s
"""
if not neo4j_client:
raise Exception("Neo4j client not initialized")
results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
query, {"submission_id": submission_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Submission not found")
submission = results[0]["s"]
return {
"submission_id": submission_id,
"status": submission.get("status"),
"hmrc_reference": submission.get("hmrc_reference"),
"submission_timestamp": submission.get("submission_timestamp"),
"validation_results": json.loads(
submission.get("validation_results", "{}")
),
"dry_run": submission.get("dry_run", False),
"error_message": submission.get("error_message"),
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get submission status",
submission_id=submission_id,
error=str(e),
)
raise HTTPException(
status_code=500, detail="Failed to get submission status"
)
@app.post("/oauth/authorize")
async def initiate_oauth_flow(
taxpayer_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Initiate OAuth flow for HMRC authorization"""
with tracer.start_as_current_span("initiate_oauth") as span:
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Generate state parameter for security
state = str(ulid.new())
# Build authorization URL
base_url = (
settings.hmrc_sandbox_url
if settings.use_sandbox
else settings.hmrc_base_url
)
auth_url = f"{base_url}/oauth/authorize"
params = {
"response_type": "code",
"client_id": settings.client_id,
"scope": "read:self-assessment write:self-assessment",
"state": state,
"redirect_uri": settings.redirect_uri,
}
# Store state for validation
await _store_oauth_state(state, taxpayer_id, tenant_id)
# Build full URL (percent-encode the query string; the scope value contains spaces)
param_string = urllib.parse.urlencode(params)
full_auth_url = f"{auth_url}?{param_string}"
return {
"authorization_url": full_auth_url,
"state": state,
"expires_in": 600, # 10 minutes
}
except Exception as e:
logger.error("Failed to initiate OAuth flow", error=str(e))
raise HTTPException(status_code=500, detail="Failed to initiate OAuth flow")
@app.post("/oauth/callback")
async def handle_oauth_callback(
code: str,
state: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Handle OAuth callback from HMRC"""
with tracer.start_as_current_span("handle_oauth_callback") as span:
span.set_attribute("state", state)
span.set_attribute("tenant_id", tenant_id)
if not neo4j_client:
raise HTTPException(status_code=500, detail="Neo4j client not initialized")
try:
# Validate state
oauth_data = await _get_oauth_state(state)
if not oauth_data or oauth_data.get("tenant_id") != tenant_id:
raise HTTPException(status_code=400, detail="Invalid state parameter")
# Exchange code for access token
token_data = await _exchange_code_for_token(code)
# Store encrypted tokens
if vault_helper is None:
raise HTTPException(
status_code=500, detail="Vault helper not initialized"
)
encrypted_access_token = vault_helper.encrypt_field(
"hmrc-access-token", token_data["access_token"]
)
encrypted_refresh_token = vault_helper.encrypt_field(
"hmrc-refresh-token", token_data.get("refresh_token", "")
)
# Store authorization in Neo4j
auth_properties = {
"taxpayer_id": oauth_data["taxpayer_id"],
"tenant_id": tenant_id,
"access_token": encrypted_access_token,
"refresh_token": encrypted_refresh_token,
"expires_at": datetime.utcnow().timestamp()
+ token_data.get("expires_in", 3600),
"scope": token_data.get("scope", ""),
"authorized_at": datetime.utcnow().isoformat(),
"source": "oauth_flow",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("HMRCAuthorization", auth_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Clean up state
await _delete_oauth_state(state)
return {
"status": "authorized",
"taxpayer_id": oauth_data["taxpayer_id"],
"scope": token_data.get("scope", ""),
"expires_in": token_data.get("expires_in", 3600),
}
except HTTPException:
raise
except Exception as e:
logger.error("OAuth callback failed", error=str(e))
raise HTTPException(status_code=500, detail="OAuth callback failed")
async def _handle_form_filled(topic: str, payload: EventPayload) -> None:
"""Handle form completion events for auto-submission"""
try:
if not neo4j_client:
raise Exception("Neo4j client not initialized")
data = payload.data
form_id = data.get("form_id")
tenant_id = data.get("tenant_id")
calculation_id = data.get("calculation_id")
if not form_id or not tenant_id:
logger.warning("Invalid form filled event", data=data)
return
# Only auto-submit if configured (this would be a tenant setting)
auto_submit = False # Default to false for safety
if auto_submit and calculation_id:
logger.info(
"Auto-submitting form to HMRC",
form_id=form_id,
calculation_id=calculation_id,
)
# Get taxpayer ID from calculation
calc_query = """
MATCH (c:Calculation {calculation_id: $calculation_id})
WHERE c.retracted_at IS NULL
RETURN c.taxpayer_id as taxpayer_id, c.tax_year as tax_year
"""
calc_results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
calc_query, {"calculation_id": calculation_id}
)
if calc_results:
taxpayer_id = calc_results[0]["taxpayer_id"]
tax_year = calc_results[0]["tax_year"]
await _submit_to_hmrc_async(
tax_year=tax_year,
taxpayer_id=taxpayer_id,
dry_run=True, # Always dry run for auto-submission
tenant_id=tenant_id,
submission_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle form filled event", error=str(e))
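# The auto_submit flag in _handle_form_filled above is hard-coded to False. A hedged
# sketch of reading it as a per-tenant setting instead (the TenantSettings label and
# auto_submit_hmrc property are assumptions, not existing graph schema):
#
# settings_query = """
# MATCH (t:TenantSettings {tenant_id: $tenant_id})
# WHERE t.retracted_at IS NULL
# RETURN t.auto_submit_hmrc AS auto_submit
# """
# rows = await neo4j_client.run_query(settings_query, {"tenant_id": tenant_id})
# auto_submit = bool(rows and rows[0].get("auto_submit"))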
async def _submit_to_hmrc_async(
tax_year: str,
taxpayer_id: str,
dry_run: bool,
tenant_id: str,
submission_id: str,
actor: str,
) -> None:
"""Submit to HMRC asynchronously"""
with tracer.start_as_current_span("submit_to_hmrc_async") as span:
span.set_attribute("submission_id", submission_id)
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("dry_run", dry_run)
if not event_bus:
raise Exception("Event bus not initialized")
try:
# Get taxpayer data
taxpayer_data = await _get_taxpayer_data(taxpayer_id, tenant_id)
# Get calculation data
calculation_data = await _get_latest_calculation(
taxpayer_id, tax_year, tenant_id
)
# Validate data
validation_results = await _validate_submission_data(
taxpayer_data, calculation_data
)
# Prepare submission
submission_data = await _prepare_submission_data(
taxpayer_data, calculation_data, tax_year
)
# Submit to HMRC (or simulate if dry run)
if dry_run:
hmrc_response = await _simulate_hmrc_submission(submission_data)
else:
hmrc_response = await _submit_to_hmrc_api(
submission_data, taxpayer_id, tenant_id
)
# Store submission record
await _store_submission_record(
submission_id,
taxpayer_id,
tax_year,
tenant_id,
hmrc_response,
validation_results,
dry_run,
)
# Update metrics
metrics.counter("hmrc_submissions_total").labels(
tenant_id=tenant_id,
dry_run=str(dry_run),
status=hmrc_response.get("status", "unknown"),
).inc()
# Publish completion event
event_payload = EventPayload(
data={
"submission_id": submission_id,
"taxpayer_id": taxpayer_id,
"tax_year": tax_year,
"tenant_id": tenant_id,
"status": hmrc_response.get("status"),
"hmrc_reference": hmrc_response.get("reference"),
"dry_run": dry_run,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.HMRC_SUBMITTED, event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info(
"HMRC submission completed",
submission_id=submission_id,
status=hmrc_response.get("status"),
dry_run=dry_run,
)
except Exception as e:
logger.error(
"HMRC submission failed", submission_id=submission_id, error=str(e)
)
# Store error record
await _store_submission_error(submission_id, str(e), tenant_id)
# Update error metrics
metrics.counter("hmrc_submission_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
async def _get_taxpayer_data(taxpayer_id: str, tenant_id: str) -> dict[str, Any]:
"""Get taxpayer data from knowledge graph"""
query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})
WHERE t.retracted_at IS NULL
RETURN t
"""
if not neo4j_client:
raise Exception("Neo4j client not initialized")
results = await neo4j_client.run_query(
query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id}
)
if not results:
raise Exception(f"Taxpayer not found: {taxpayer_id}")
return results[0]["t"]
async def _get_latest_calculation(
taxpayer_id: str, tax_year: str, tenant_id: str
) -> dict[str, Any]:
"""Get latest calculation for taxpayer and tax year"""
query = """
MATCH (c:Calculation {taxpayer_id: $taxpayer_id, tax_year: $tax_year, tenant_id: $tenant_id})
WHERE c.retracted_at IS NULL
RETURN c
ORDER BY c.calculated_at DESC
LIMIT 1
"""
if not neo4j_client:
raise Exception("Neo4j client not initialized")
results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
query,
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
)
if not results:
raise Exception(
f"No calculation found for taxpayer {taxpayer_id} and tax year {tax_year}"
)
return results[0]["c"]
async def _validate_submission_data(
taxpayer_data: dict[str, Any], calculation_data: dict[str, Any]
) -> dict[str, Any]:
"""Validate submission data"""
validation_results: dict[str, bool | list[str]] = {
"valid": True,
"errors": [],
"warnings": [],
}
# Check required taxpayer fields
if not taxpayer_data.get("utr"):
validation_results["errors"].append("UTR is required")
validation_results["valid"] = False
if not taxpayer_data.get("ni_number"):
validation_results["errors"].append("National Insurance number is required")
validation_results["valid"] = False
# Check calculation data
if not calculation_data.get("schedule"):
validation_results["errors"].append("Schedule is required")
validation_results["valid"] = False
return validation_results
async def _prepare_submission_data(
taxpayer_data: dict[str, Any], calculation_data: dict[str, Any], tax_year: str
) -> dict[str, Any]:
"""Prepare data for HMRC submission"""
# This would format data according to HMRC MTD API requirements
submission_data = {
"taxYear": tax_year,
"nino": taxpayer_data.get("ni_number"),
"utr": taxpayer_data.get("utr"),
"schedule": calculation_data.get("schedule"),
"submissionTimestamp": datetime.utcnow().isoformat(),
}
return submission_data
async def _simulate_hmrc_submission(submission_data: dict[str, Any]) -> dict[str, Any]:
"""Simulate HMRC submission for dry run"""
# Simulate processing delay
await asyncio.sleep(1)
return {
"status": "accepted",
"reference": f"DRY_RUN_{ulid.new()}",
"timestamp": datetime.utcnow().isoformat(),
"dry_run": True,
}
async def _submit_to_hmrc_api(
submission_data: dict[str, Any], taxpayer_id: str, tenant_id: str
) -> dict[str, Any]:
"""Submit to actual HMRC API"""
# This would implement the actual HMRC MTD API calls
# For now, return mock response
logger.warning("Actual HMRC API submission not implemented")
return {
"status": "not_implemented",
"reference": None,
"timestamp": datetime.utcnow().isoformat(),
"error": "HMRC API integration not implemented",
}
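# A hedged sketch of the real submission call, assuming an httpx dependency, an access
# token already retrieved and decrypted for the taxpayer, and that the endpoint path,
# headers and response fields are taken from the HMRC MTD specification (all of these
# are assumptions, not implemented here):
#
# import httpx
#
# base_url = settings.hmrc_sandbox_url if settings.use_sandbox else settings.hmrc_base_url
# async with httpx.AsyncClient(base_url=base_url, timeout=settings.submission_timeout) as client:
#     response = await client.post(
#         endpoint_path,  # built from the MTD endpoint templates in HMRCSettings
#         json=submission_data,
#         headers={
#             "Authorization": f"Bearer {access_token}",
#             "Accept": "application/vnd.hmrc.1.0+json",
#         },
#     )
#     response.raise_for_status()
#     body = response.json()
#     return {
#         "status": "accepted",
#         "reference": body.get("transactionReference"),  # illustrative field name
#         "timestamp": datetime.utcnow().isoformat(),
#     }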
async def _store_submission_record(
submission_id: str,
taxpayer_id: str,
tax_year: str,
tenant_id: str,
hmrc_response: dict[str, Any],
validation_results: dict[str, Any],
dry_run: bool,
) -> None:
"""Store submission record in knowledge graph"""
submission_properties = {
"submission_id": submission_id,
"taxpayer_id": taxpayer_id,
"tax_year": tax_year,
"tenant_id": tenant_id,
"status": hmrc_response.get("status"),
"hmrc_reference": hmrc_response.get("reference"),
"submission_timestamp": hmrc_response.get("timestamp"),
"validation_results": json.dumps(validation_results),
"dry_run": dry_run,
"source": "hmrc_service",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
if not neo4j_client:
raise Exception("Neo4j client not initialized")
await neo4j_client.create_node("Submission", submission_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
async def _store_submission_error(
submission_id: str, error_message: str, tenant_id: str
) -> None:
"""Store submission error"""
error_properties = {
"submission_id": submission_id,
"tenant_id": tenant_id,
"status": "error",
"error_message": error_message,
"submission_timestamp": datetime.utcnow().isoformat(),
"source": "hmrc_service",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
if not neo4j_client:
raise Exception("Neo4j client not initialized")
await neo4j_client.create_node("Submission", error_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
async def _store_oauth_state(state: str, taxpayer_id: str, tenant_id: str) -> None:
"""Store OAuth state temporarily"""
# This would use Redis or similar for temporary storage
# For now, just log
logger.debug("OAuth state stored", state=state, taxpayer_id=taxpayer_id)
async def _get_oauth_state(state: str) -> dict[str, Any] | None:
"""Get OAuth state"""
# This would retrieve from Redis
# For now, return mock data
return {"taxpayer_id": "test_taxpayer", "tenant_id": "test_tenant"}
async def _delete_oauth_state(state: str) -> None:
"""Delete OAuth state"""
# This would delete from Redis
logger.debug("OAuth state deleted", state=state)
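# A minimal sketch of the Redis-backed state storage described by the three helpers
# above, assuming redis.asyncio is available and BaseAppSettings exposes redis_url
# (client setup, key prefix and TTL are assumptions):
#
# from redis.asyncio import Redis
#
# redis_client = Redis.from_url(settings.redis_url)
#
# async def _store_oauth_state(state: str, taxpayer_id: str, tenant_id: str) -> None:
#     await redis_client.setex(
#         f"hmrc:oauth:state:{state}",
#         600,  # match the 10 minute expiry returned by /oauth/authorize
#         json.dumps({"taxpayer_id": taxpayer_id, "tenant_id": tenant_id}),
#     )
#
# async def _get_oauth_state(state: str) -> dict[str, Any] | None:
#     raw = await redis_client.get(f"hmrc:oauth:state:{state}")
#     return json.loads(raw) if raw else None
#
# async def _delete_oauth_state(state: str) -> None:
#     await redis_client.delete(f"hmrc:oauth:state:{state}")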
async def _exchange_code_for_token(code: str) -> dict[str, Any]:
"""Exchange authorization code for access token"""
# This would call HMRC token endpoint
# For now, return mock token
return {
"access_token": "mock_access_token",
"refresh_token": "mock_refresh_token",
"expires_in": 3600,
"scope": "read:self-assessment write:self-assessment",
}
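# A hedged sketch of the real token exchange, assuming an httpx dependency and the
# standard OAuth2 authorization_code grant; the /oauth/token path should be confirmed
# against the HMRC developer documentation:
#
# import httpx
#
# base_url = settings.hmrc_sandbox_url if settings.use_sandbox else settings.hmrc_base_url
# async with httpx.AsyncClient(base_url=base_url, timeout=30) as client:
#     response = await client.post(
#         "/oauth/token",
#         data={
#             "grant_type": "authorization_code",
#             "code": code,
#             "client_id": settings.client_id,
#             "client_secret": settings.client_secret,
#             "redirect_uri": settings.redirect_uri,
#         },
#     )
#     response.raise_for_status()
#     return response.json()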
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8010, reload=True, log_config=None)

40
apps/svc_hmrc/requirements.txt Normal file
View File

@@ -0,0 +1,40 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# OAuth and authentication
authlib>=1.2.0
oauthlib>=3.2.0
# HTTP client with OAuth support
requests-oauthlib>=1.3.0
# XML processing for HMRC APIs
lxml>=4.9.0
xmltodict>=0.13.0
# JSON Web Tokens
pyjwt>=2.8.0
# GOV.UK Frontend Jinja templates
govuk-frontend-jinja>=2.8.0
# Date and time for tax years
python-dateutil>=2.8.0
# Retry mechanisms
tenacity>=8.2.0
# Rate limiting
ratelimit>=2.2.0
# API validation
marshmallow>=3.20.0
# Encryption for sensitive data
cryptography>=41.0.0
# Additional HTTP utilities
urllib3>=2.1.0

54
apps/svc_ingestion/Dockerfile Normal file
View File

@@ -0,0 +1,54 @@
# Multi-stage build for svc_ingestion
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
# Use base requirements (no ML dependencies for ingestion service)
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_ingestion/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_ingestion/ ./apps/svc_ingestion/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_ingestion.main:app", "--host", "0.0.0.0", "--port", "8000"]

10
apps/svc_ingestion/docker.env Normal file
View File

@@ -0,0 +1,10 @@
# FILE: apps/svc_ingestion/docker.env
VAULT_ADDR=http://vault:8200
VAULT_TOKEN=${VAULT_DEV_ROOT_TOKEN_ID:-root}
MINIO_ENDPOINT=minio:9092
POSTGRES_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@postgres:5432/tax_system
REDIS_URL=redis://redis:6379
EVENT_BUS_TYPE=${EVENT_BUS_TYPE:-memory}
NATS_SERVERS=${NATS_SERVERS:-nats://nats:4222}
NATS_STREAM_NAME=${NATS_STREAM_NAME:-TAX_AGENT_EVENTS}
NATS_CONSUMER_GROUP=${NATS_CONSUMER_GROUP:-tax-agent}

351
apps/svc_ingestion/main.py Normal file
View File

@@ -0,0 +1,351 @@
"""Document upload, storage, checksum validation, metadata extraction service."""
import hashlib
import mimetypes
import os
# Import shared libraries
import sys
from datetime import UTC, datetime
from typing import Any, cast
import structlog
import ulid
from fastapi import Depends, File, HTTPException, Request, UploadFile
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app, get_tenant_dependency, get_user_dependency
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer
from libs.schemas import DocumentKind, DocumentUploadResponse
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class IngestionSettings(BaseAppSettings):
"""Settings for ingestion service"""
service_name: str = "svc-ingestion"
# File upload limits
max_file_size: int = 50 * 1024 * 1024 # 50MB
allowed_mime_types: list[str] = [
"application/pdf",
"image/jpeg",
"image/png",
"image/tiff",
"text/csv",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
# Storage configuration
raw_documents_bucket: str = "raw-documents"
evidence_bucket: str = "evidence"
# Global clients (initialized by init_dependencies at import time)
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
# Settings will be initialized after app creation
settings: IngestionSettings
def init_dependencies(app_settings: IngestionSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus, settings
settings = app_settings
logger.info(
"Starting ingestion service",
minio_endpoint=settings.minio_endpoint,
minio_access_key=settings.minio_access_key,
)
# Initialize clients
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
event_bus = create_event_bus(settings)
logger.info("Ingestion service started successfully")
# Create app and settings
app, _settings = create_app(
service_name="svc-ingestion",
title="Tax Agent Ingestion Service",
description="Document upload and storage service",
settings_class=IngestionSettings,
)
# Initialize dependencies immediately
init_dependencies(cast(IngestionSettings, _settings))
# Get observability components
tracer = get_tracer("svc-ingestion")
metrics = get_metrics("svc-ingestion")
# Health endpoints are provided by app_factory
@app.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
request: Request,
file: UploadFile = File(...),
kind: DocumentKind = DocumentKind.INVOICE,
source: str = "manual_upload",
current_user: dict[str, Any] = Depends(get_user_dependency()),
tenant_id: str = Depends(get_tenant_dependency()),
) -> DocumentUploadResponse:
"""Upload document for processing"""
# Check if services are initialized
if document_storage is None or event_bus is None:
raise HTTPException(
status_code=503, detail="Service not ready - dependencies not initialized"
)
with tracer.start_as_current_span("upload_document") as span:
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("document_kind", kind.value)
span.set_attribute("source", source)
try:
# Validate file
await _validate_upload(file)
# Generate document ID
doc_id = f"doc_{ulid.new()}"
span.set_attribute("doc_id", doc_id)
# Read file content
content = await file.read()
# Calculate checksum
checksum = hashlib.sha256(content).hexdigest()
# Detect MIME type
detected_mime = None
if file.filename:
detected_mime = mimetypes.guess_type(file.filename)[0]
content_type = (
detected_mime or file.content_type or "application/octet-stream"
)
# Store document
storage_result = await document_storage.store_document(
tenant_id=tenant_id,
doc_id=doc_id,
content=content,
content_type=content_type,
metadata={
"original_filename": file.filename or "unknown",
"kind": kind.value,
"source": source,
"uploaded_by": current_user.get("sub", "unknown"),
"uploaded_at": datetime.now(UTC).isoformat(),
},
)
# Publish event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"kind": kind.value,
"source": source,
"checksum": checksum,
"file_size": len(content),
"content_type": content_type,
"s3_url": storage_result["s3_url"],
},
actor=current_user.get("sub", "system"),
tenant_id=tenant_id,
trace_id=str(span.get_span_context().trace_id),
)
await event_bus.publish(EventTopics.DOC_INGESTED, event_payload)
# Update metrics
metrics.counter(
"documents_uploaded_total", labelnames=["tenant_id", "kind", "source"]
).labels(tenant_id=tenant_id, kind=kind.value, source=source).inc()
metrics.histogram(
"document_size_bytes", labelnames=["tenant_id", "kind"]
).labels(tenant_id=tenant_id, kind=kind.value).observe(len(content))
logger.info(
"Document uploaded successfully",
doc_id=doc_id,
tenant_id=tenant_id,
kind=kind.value,
size=len(content),
checksum=checksum,
)
return DocumentUploadResponse(
doc_id=doc_id, s3_url=storage_result["s3_url"], checksum=checksum
)
except ValueError as e:
logger.warning("Upload validation failed", error=str(e))
# Track validation errors
try:
metrics.counter(
"upload_errors_total", labelnames=["tenant_id", "error_type"]
).labels(tenant_id=tenant_id, error_type="ValueError").inc()
except Exception:
pass # Don't fail on metrics errors
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error("Upload failed", error=str(e))
# Track upload errors
try:
metrics.counter(
"upload_errors_total", labelnames=["tenant_id", "error_type"]
).labels(tenant_id=tenant_id, error_type=type(e).__name__).inc()
except Exception:
pass # Don't fail on metrics errors
raise HTTPException(status_code=500, detail="Upload failed")
@app.get("/documents/{doc_id}")
async def get_document_info(
doc_id: str,
current_user: dict[str, Any] = Depends(get_user_dependency()),
tenant_id: str = Depends(get_tenant_dependency()),
) -> dict[str, str]:
"""Get document information"""
# Check if services are initialized
if storage_client is None:
raise HTTPException(
status_code=503, detail="Service not ready - dependencies not initialized"
)
with tracer.start_as_current_span("get_document_info") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if document exists
ingestion_settings = cast(IngestionSettings, settings)
bucket_name = ingestion_settings.raw_documents_bucket
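            # NOTE: raw documents are looked up under a fixed ".pdf" key here,
            # regardless of the content type recorded at upload time.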
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
exists = await storage_client.object_exists(bucket_name, object_key)
if not exists:
raise HTTPException(status_code=404, detail="Document not found")
# Get presigned URL for download
download_url = await storage_client.get_presigned_url(
bucket_name=bucket_name, object_name=object_key, method="GET"
)
if not download_url:
raise HTTPException(
status_code=500, detail="Failed to generate download URL"
)
return {
"doc_id": doc_id,
"download_url": download_url,
"s3_url": f"s3://{bucket_name}/{object_key}",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get document info", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get document info")
@app.delete("/documents/{doc_id}")
async def delete_document(
doc_id: str,
current_user: dict[str, Any] = Depends(get_user_dependency()),
tenant_id: str = Depends(get_tenant_dependency()),
) -> dict[str, str]:
"""Delete document"""
# Check if services are initialized
if storage_client is None:
raise HTTPException(
status_code=503, detail="Service not ready - dependencies not initialized"
)
with tracer.start_as_current_span("delete_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Delete from storage
ingestion_settings = cast(IngestionSettings, settings)
bucket_name = ingestion_settings.raw_documents_bucket
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
success = await storage_client.delete_object(bucket_name, object_key)
if not success:
raise HTTPException(status_code=404, detail="Document not found")
logger.info("Document deleted", doc_id=doc_id, tenant_id=tenant_id)
return {"message": "Document deleted successfully"}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to delete document", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to delete document")
async def _validate_upload(file: UploadFile) -> None:
"""Validate uploaded file"""
# Cast settings to the correct type
ingestion_settings = cast(IngestionSettings, settings)
# Check file size
if file.size and file.size > ingestion_settings.max_file_size:
raise ValueError(
f"File too large: {file.size} bytes (max: {ingestion_settings.max_file_size})"
)
# Check MIME type
if file.content_type not in ingestion_settings.allowed_mime_types:
# Try to detect MIME type from filename
detected_mime = None
if file.filename:
detected_mime = mimetypes.guess_type(file.filename)[0]
if detected_mime not in ingestion_settings.allowed_mime_types:
raise ValueError(f"Unsupported file type: {file.content_type}")
# Check filename
if not file.filename:
raise ValueError("Filename is required")
# Check for malicious filenames
if ".." in file.filename or "/" in file.filename or "\\" in file.filename:
raise ValueError("Invalid filename")
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"main:app",
host="0.0.0.0",
port=8000,
reload=True,
log_config=None, # Use structlog configuration
)

9
apps/svc_ingestion/requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
# Service-specific dependencies for svc_ingestion
# File upload and processing
aiofiles>=23.2.0
# MIME type detection
python-magic>=0.4.27
# Image processing (for thumbnails) - lightweight
Pillow>=10.1.0

54
apps/svc_kg/Dockerfile Normal file
View File

@@ -0,0 +1,54 @@
# Multi-stage build for svc_kg
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY libs/requirements-rdf.txt /tmp/libs-rdf.txt
COPY apps/svc_kg/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-rdf.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_kg/ ./apps/svc_kg/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_kg.main:app", "--host", "0.0.0.0", "--port", "8000"]

572
apps/svc_kg/main.py Normal file
View File

@@ -0,0 +1,572 @@
# FILE: apps/svc-kg/main.py
# Knowledge graph facade with CRUD, queries, lineage, and SHACL validation
import json
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
from fastapi import Depends, HTTPException, Query, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.events import EventBus
from libs.neo import Neo4jClient, SHACLValidator, TemporalQueries
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class KGSettings(BaseAppSettings):
"""Settings for KG service"""
service_name: str = "svc-kg"
# SHACL validation
shapes_file: str = "schemas/shapes.ttl"
validate_on_write: bool = True
# Query limits
max_results: int = 1000
max_depth: int = 10
query_timeout: int = 30
# Create app and settings
app, settings = create_app(
service_name="svc-kg",
title="Tax Agent Knowledge Graph Service",
description="Knowledge graph facade with CRUD and queries",
settings_class=KGSettings,
)
# Global clients
neo4j_client: Neo4jClient | None = None
shacl_validator: SHACLValidator | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-kg")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, shacl_validator, event_bus
logger.info("Starting KG service")
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize SHACL validator
if os.path.exists(settings.shapes_file):
shacl_validator = SHACLValidator(settings.shapes_file)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
logger.info("KG service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down KG service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("KG service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/nodes/{label}")
async def create_node(
label: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create a new node"""
with tracer.start_as_current_span("create_node") as span:
span.set_attribute("label", label)
span.set_attribute("tenant_id", tenant_id)
try:
# Add tenant isolation
properties["tenant_id"] = tenant_id
properties["created_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Create node
result = await neo4j_client.create_node(label, properties)
# Update metrics
metrics.counter("nodes_created_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node created", label=label, node_id=result.get("id"))
return {
"status": "created",
"label": label,
"properties": properties,
"neo4j_result": result,
}
except Exception as e:
logger.error("Failed to create node", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create node: {str(e)}"
)
@app.get("/nodes/{label}")
async def get_nodes(
label: str,
limit: int = Query(default=100, le=settings.max_results),
filters: str | None = Query(default=None),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get nodes by label with optional filters"""
with tracer.start_as_current_span("get_nodes") as span:
span.set_attribute("label", label)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("limit", limit)
try:
# Parse filters
filter_dict: dict[str, Any] = {}
if filters:
try:
filter_dict = json.loads(filters)
except json.JSONDecodeError:
raise HTTPException(status_code=400, detail="Invalid filters JSON")
# Add tenant isolation
filter_dict["tenant_id"] = tenant_id
# Build query
query = TemporalQueries.get_current_state_query(label, filter_dict)
query += f" LIMIT {limit}"
# Execute query
results = await neo4j_client.run_query(query)
# Update metrics
metrics.counter("nodes_queried_total").labels(
tenant_id=tenant_id, label=label
).inc()
return {
"label": label,
"count": len(results),
"nodes": [result["n"] for result in results],
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get nodes", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to get nodes: {str(e)}"
)
@app.get("/nodes/{label}/{node_id}")
async def get_node(
label: str,
node_id: str,
include_lineage: bool = Query(default=False),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get specific node with optional lineage"""
with tracer.start_as_current_span("get_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get node
query = f"""
MATCH (n:{label} {{id: $node_id, tenant_id: $tenant_id}})
WHERE n.retracted_at IS NULL
RETURN n
"""
results = await neo4j_client.run_query(
query, {"node_id": node_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Node not found")
node_data = results[0]["n"]
# Get lineage if requested
lineage: list[dict[str, Any]] = []
if include_lineage:
lineage = await neo4j_client.get_node_lineage(node_id)
return {"node": node_data, "lineage": lineage if include_lineage else None}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(status_code=500, detail=f"Failed to get node: {str(e)}")
@app.put("/nodes/{label}/{node_id}")
async def update_node(
label: str,
node_id: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Update node with bitemporal versioning"""
with tracer.start_as_current_span("update_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
properties["tenant_id"] = tenant_id
properties["updated_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Update node (creates new version)
await neo4j_client.update_node(label, node_id, properties)
# Update metrics
metrics.counter("nodes_updated_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node updated", label=label, node_id=node_id)
return {
"status": "updated",
"label": label,
"node_id": node_id,
"properties": properties,
}
except Exception as e:
logger.error(
"Failed to update node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(
status_code=500, detail=f"Failed to update node: {str(e)}"
)
@app.post("/relationships")
async def create_relationship(
from_label: str,
from_id: str,
to_label: str,
to_id: str,
relationship_type: str,
properties: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create relationship between nodes"""
with tracer.start_as_current_span("create_relationship") as span:
span.set_attribute("from_label", from_label)
span.set_attribute("to_label", to_label)
span.set_attribute("relationship_type", relationship_type)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
rel_properties = properties or {}
rel_properties["tenant_id"] = tenant_id
rel_properties["created_by"] = current_user.get("sub", "system")
# Create relationship
await neo4j_client.create_relationship(
from_label, from_id, to_label, to_id, relationship_type, rel_properties
)
# Update metrics
metrics.counter("relationships_created_total").labels(
tenant_id=tenant_id, relationship_type=relationship_type
).inc()
logger.info(
"Relationship created",
from_id=from_id,
to_id=to_id,
type=relationship_type,
)
return {
"status": "created",
"from_id": from_id,
"to_id": to_id,
"relationship_type": relationship_type,
"properties": rel_properties,
}
except Exception as e:
logger.error("Failed to create relationship", error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create relationship: {str(e)}"
)
@app.post("/query")
async def execute_query(
query: str,
parameters: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Execute custom Cypher query with tenant isolation"""
with tracer.start_as_current_span("execute_query") as span:
span.set_attribute("tenant_id", tenant_id)
try:
# Add tenant isolation to parameters
query_params = parameters or {}
query_params["tenant_id"] = tenant_id
# Validate query (basic security check)
if not _is_safe_query(query):
raise HTTPException(status_code=400, detail="Unsafe query detected")
# Execute query with timeout
results = await neo4j_client.run_query(query, query_params, max_retries=1)
# Update metrics
metrics.counter("custom_queries_total").labels(tenant_id=tenant_id).inc()
return {
"query": query,
"parameters": query_params,
"results": results,
"count": len(results),
}
except Exception as e:
logger.error("Query execution failed", query=query[:100], error=str(e))
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
@app.get("/export/rdf")
async def export_rdf(
format: str = Query(default="turtle"),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Export knowledge graph as RDF"""
with tracer.start_as_current_span("export_rdf") as span:
span.set_attribute("format", format)
span.set_attribute("tenant_id", tenant_id)
try:
# Export tenant-specific data
rdf_data = await neo4j_client.export_to_rdf(format)
# Update metrics
metrics.counter("rdf_exports_total").labels(
tenant_id=tenant_id, format=format
).inc()
return {
"format": format,
"rdf_data": rdf_data,
"exported_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("RDF export failed", format=format, error=str(e))
raise HTTPException(
status_code=500, detail=f"RDF export failed: {str(e)}"
) from e
@app.post("/validate")
async def validate_graph(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Validate knowledge graph with SHACL"""
with tracer.start_as_current_span("validate_graph") as span:
span.set_attribute("tenant_id", tenant_id)
try:
if not shacl_validator:
raise HTTPException(
status_code=501, detail="SHACL validation not configured"
)
# Export current graph state
rdf_export = await neo4j_client.export_to_rdf("turtle")
# Extract RDF data from export result
rdf_data = rdf_export.get("rdf_data", "")
if not rdf_data:
raise HTTPException(
status_code=500, detail="Failed to export RDF data for validation"
)
# Run SHACL validation
validation_result = await shacl_validator.validate_graph(rdf_data)
# Update metrics
metrics.counter("validations_total").labels(
tenant_id=tenant_id, conforms=validation_result["conforms"]
).inc()
return {
"conforms": validation_result["conforms"],
"violations_count": validation_result["violations_count"],
"results_text": validation_result["results_text"],
"validated_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("Graph validation failed", error=str(e))
raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
async def _validate_node(label: str, properties: dict[str, Any]) -> bool:
"""Validate node with SHACL"""
if not shacl_validator:
return True
try:
# Create a minimal RDF representation of the node for validation
rdf_lines = ["@prefix tax: <https://tax-kg.example.com/> ."]
node_uri = "tax:temp_node"
# Add type declaration
rdf_lines.append(f"{node_uri} a tax:{label} .")
# Add properties
for prop, value in properties.items():
if isinstance(value, str):
rdf_lines.append(f'{node_uri} tax:{prop} "{value}" .')
else:
rdf_lines.append(f"{node_uri} tax:{prop} {value} .")
rdf_data = "\n".join(rdf_lines)
# Validate the node RDF data
validation_result = await shacl_validator.validate_graph(rdf_data)
if not validation_result["conforms"]:
logger.warning(
"Node SHACL validation failed",
label=label,
violations=validation_result["violations_count"],
details=validation_result["results_text"],
)
return False
logger.debug("Node SHACL validation passed", label=label)
return True
except Exception as e:
logger.error("Node SHACL validation error", label=label, error=str(e))
# Return True to not block operations on validation errors
return True
def _is_safe_query(query: str) -> bool:
"""Basic query safety check"""
query_lower = query.lower()
# Block dangerous operations
dangerous_keywords = [
"delete",
"remove",
"drop",
"create index",
"create constraint",
"load csv",
"call",
"foreach",
]
for keyword in dangerous_keywords:
if keyword in query_lower:
return False
return True
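# Illustrative behaviour of _is_safe_query (shown as examples, not executed):
#   _is_safe_query("MATCH (n:IncomeItem) RETURN n")  -> True
#   _is_safe_query("MATCH (n) DETACH DELETE n")      -> False (contains "delete")
#   _is_safe_query("CALL db.labels()")               -> False (contains "call")
# The substring match is deliberately conservative and also rejects queries
# that merely contain one of the keywords inside another word (e.g. "recall").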
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
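# --- Illustrative client snippet (not part of the service API) --------------
# A minimal sketch of creating a node and reading it back via the endpoints
# above; the base URL, port, and bearer token are assumptions for local use.
async def example_create_and_query(token: str = "<token>") -> None:
    import httpx
    headers = {"Authorization": f"Bearer {token}"}
    async with httpx.AsyncClient(
        base_url="http://localhost:8005", headers=headers
    ) as client:
        # Create an IncomeItem node; tenant_id and created_by are added server-side.
        created = await client.post(
            "/nodes/IncomeItem",
            json={"id": "income_001", "amount": 1250.0, "currency": "GBP"},
        )
        created.raise_for_status()
        # Query nodes back; "filters" is a JSON-encoded property filter.
        listed = await client.get(
            "/nodes/IncomeItem",
            params={"limit": 10, "filters": json.dumps({"currency": "GBP"})},
        )
        listed.raise_for_status()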
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8005, reload=True, log_config=None)

22
apps/svc_kg/requirements.txt Normal file
View File

@@ -0,0 +1,22 @@
# Service-specific dependencies
# RDF and semantic web
rdflib>=7.0.0
pyshacl>=0.25.0
# Graph algorithms
networkx>=3.2.0
# Data export formats
xmltodict>=0.13.0
# Query optimization
pyparsing>=3.1.0
# Graph visualization (optional)
graphviz>=0.20.0
# Additional Neo4j utilities
neomodel>=5.2.0
# Cypher query building
py2neo>=2021.2.4

53
apps/svc_normalize_map/Dockerfile Normal file
View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_normalize_map
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]

590
apps/svc_normalize_map/main.py Normal file
View File

@@ -0,0 +1,590 @@
"""Data normalization and knowledge graph mapping."""
# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_minio_client,
create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class NormalizeMapSettings(BaseAppSettings):
"""Settings for normalize-map service"""
service_name: str = "svc-normalize-map"
# Normalization configuration
currency_default: str = "GBP"
date_formats: list[str] = [
"%Y-%m-%d",
"%d/%m/%Y",
"%d-%m-%Y",
"%d %B %Y",
"%d %b %Y",
"%B %d, %Y",
]
# Mapping configuration
confidence_threshold: float = 0.7
auto_create_entities: bool = True
# Validation rules
max_amount: float = 1000000.0 # £1M
min_confidence: float = 0.5
# Create app and settings
app, settings = create_app(
service_name="svc-normalize-map",
title="Tax Agent Normalize-Map Service",
description="Data normalization and knowledge graph mapping service",
settings_class=NormalizeMapSettings,
)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, event_bus
logger.info("Starting normalize-map service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
# Subscribe to extraction completion events
await event_bus.subscribe( # type: ignore
EventTopics.DOC_EXTRACTED, _handle_extraction_completed
)
logger.info("Normalize-map service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, neo4j_client
logger.info("Shutting down normalize-map service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Normalize-map service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/normalize/{doc_id}")
async def normalize_document(
doc_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Normalize and map document data to knowledge graph"""
with tracer.start_as_current_span("normalize_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if extraction results exist
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# Generate normalization ID
normalization_id = str(ulid.new())
span.set_attribute("normalization_id", normalization_id)
# Start background normalization
background_tasks.add_task(
_normalize_and_map_async,
doc_id,
tenant_id,
extraction_results,
normalization_id,
current_user.get("sub", "system"),
)
logger.info(
"Normalization started",
doc_id=doc_id,
normalization_id=normalization_id,
)
return {
"normalization_id": normalization_id,
"doc_id": doc_id,
"status": "processing",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start normalization")
async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
"""Handle extraction completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
confidence = data.get("confidence", 0.0)
if not doc_id or not tenant_id:
logger.warning("Invalid extraction completion event", data=data)
return
# Only auto-process if confidence is above threshold
if confidence >= settings.confidence_threshold:
logger.info(
"Auto-normalizing extracted document",
doc_id=doc_id,
confidence=confidence,
)
extraction_results = data.get("extraction_results")
if not extraction_results:
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if extraction_results:
await _normalize_and_map_async(
doc_id=doc_id,
tenant_id=tenant_id,
extraction_results=extraction_results,
normalization_id=str(ulid.new()),
actor=payload.actor,
)
else:
logger.info(
"Skipping auto-normalization due to low confidence",
doc_id=doc_id,
confidence=confidence,
)
except Exception as e:
logger.error("Failed to handle extraction completion", error=str(e))
async def _normalize_and_map_async(
doc_id: str,
tenant_id: str,
extraction_results: dict[str, Any],
normalization_id: str,
actor: str,
) -> None:
"""Normalize and map data asynchronously"""
with tracer.start_as_current_span("normalize_and_map_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("normalization_id", normalization_id)
try:
extracted_fields = extraction_results.get("extracted_fields", {})
provenance = extraction_results.get("provenance", [])
# Normalize extracted data
normalized_data = await _normalize_data(extracted_fields, provenance)
# Map to knowledge graph entities
entities = await _map_to_entities(normalized_data, doc_id, tenant_id)
# Store entities in knowledge graph
stored_entities = await _store_entities(entities, tenant_id)
# Create normalization results
normalization_results = {
"doc_id": doc_id,
"normalization_id": normalization_id,
"normalized_at": datetime.utcnow().isoformat(),
"normalized_data": normalized_data,
"entities": stored_entities,
"entity_count": len(stored_entities),
}
logger.info("Normalization completed", results=normalization_results)
# Update metrics
metrics.counter("documents_normalized_total").labels(
tenant_id=tenant_id
).inc()
metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
len(stored_entities)
)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"normalization_id": normalization_id,
"entity_count": len(stored_entities),
"entities": stored_entities,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)
logger.info(
"Normalization completed", doc_id=doc_id, entities=len(stored_entities)
)
except Exception as e:
logger.error("Normalization failed", doc_id=doc_id, error=str(e))
# Update error metrics
metrics.counter("normalization_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
async def _normalize_data(
extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
"""Normalize extracted data"""
normalized = {}
for field_name, raw_value in extracted_fields.items():
try:
if "amount" in field_name.lower() or "total" in field_name.lower():
normalized[field_name] = _normalize_amount(raw_value)
elif "date" in field_name.lower():
normalized[field_name] = _normalize_date(raw_value)
elif "name" in field_name.lower():
normalized[field_name] = _normalize_name(raw_value)
elif "address" in field_name.lower():
normalized[field_name] = _normalize_address(raw_value)
elif "number" in field_name.lower():
normalized[field_name] = _normalize_number(raw_value)
else:
normalized[field_name] = _normalize_text(raw_value)
except Exception as e:
logger.warning(
"Failed to normalize field",
field=field_name,
value=raw_value,
error=str(e),
)
normalized[field_name] = raw_value # Keep original value
return normalized
def _normalize_amount(value: str) -> dict[str, Any]:
"""Normalize monetary amount"""
import re
if not value:
return {"amount": None, "currency": settings.currency_default}
# Remove currency symbols and formatting
clean_value = re.sub(r"[£$€,\s]", "", str(value))
try:
amount = Decimal(clean_value)
# Validate amount
if amount > settings.max_amount:
logger.warning("Amount exceeds maximum", amount=amount)
return {
"amount": float(amount),
"currency": settings.currency_default,
"original": value,
}
except Exception:
return {
"amount": None,
"currency": settings.currency_default,
"original": value,
}
def _normalize_date(value: str) -> dict[str, Any]:
"""Normalize date"""
from dateutil import parser
if not value:
return {"date": None, "original": value}
try:
# Try parsing with dateutil first
parsed_date = parser.parse(str(value), dayfirst=True)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
# Try manual formats
for fmt in settings.date_formats:
try:
parsed_date = datetime.strptime(str(value), fmt)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
continue
return {"date": None, "original": value}
def _normalize_name(value: str) -> dict[str, Any]:
"""Normalize person/company name"""
if not value:
return {"name": None, "original": value}
# Clean and title case
clean_name = str(value).strip().title()
# Detect if it's a company (contains Ltd, Limited, etc.)
company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
is_company = any(indicator in clean_name for indicator in company_indicators)
return {
"name": clean_name,
"type": "company" if is_company else "person",
"original": value,
}
def _normalize_address(value: str) -> dict[str, Any]:
"""Normalize address"""
import re
if not value:
return {"address": None, "original": value}
clean_address = str(value).strip()
# Extract UK postcode
postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
postcode = postcode_match.group().upper() if postcode_match else None
return {"address": clean_address, "postcode": postcode, "original": value}
def _normalize_number(value: str) -> dict[str, Any]:
"""Normalize reference numbers"""
import re
if not value:
return {"number": None, "original": value}
# Remove spaces and special characters
clean_number = re.sub(r"[^\w]", "", str(value))
# Detect number type
number_type = "unknown"
if len(clean_number) == 10 and clean_number.isdigit():
number_type = "utr" # UTR is 10 digits
elif len(clean_number) == 8 and clean_number.isdigit():
number_type = "account_number"
elif re.match(r"^\d{6}$", clean_number):
number_type = "sort_code"
return {"number": clean_number, "type": number_type, "original": value}
def _normalize_text(value: str) -> dict[str, Any]:
"""Normalize general text"""
if not value:
return {"text": None, "original": value}
clean_text = str(value).strip()
return {"text": clean_text, "original": value}
async def _map_to_entities(
normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
"""Map normalized data to knowledge graph entities"""
entities = []
# Create document entity
doc_entity = {
"type": "Document",
"id": doc_id,
"properties": {
"doc_id": doc_id,
"tenant_id": tenant_id,
"processed_at": datetime.utcnow().isoformat(),
"source": "extraction",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(doc_entity)
# Map specific field types to entities
for field_name, normalized_value in normalized_data.items():
if isinstance(normalized_value, dict):
if "amount" in normalized_value and normalized_value["amount"] is not None:
# Create expense or income item
entity_type = (
"ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
)
entity = {
"type": entity_type,
"id": f"{entity_type.lower()}_{ulid.new()}",
"properties": {
"amount": normalized_value["amount"],
"currency": normalized_value["currency"],
"description": field_name,
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
elif "name" in normalized_value and normalized_value["name"] is not None:
# Create party entity
entity = {
"type": "Party",
"id": f"party_{ulid.new()}",
"properties": {
"name": normalized_value["name"],
"party_type": normalized_value.get("type", "unknown"),
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
return entities
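# Illustrative mapping (entity ids are fresh ULIDs at runtime): a normalized
# field such as {"rental_income": {"amount": 950.0, "currency": "GBP", ...}}
# yields the Document entity above plus one IncomeItem, while
# {"landlord_name": {"name": "Acme Lettings Ltd", "type": "company", ...}}
# yields a Party entity with party_type "company".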
async def _store_entities(
entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
"""Store entities in knowledge graph"""
stored_entities = []
for entity in entities:
try:
# Create node in Neo4j
result = await neo4j_client.create_node(
label=entity["type"], properties=entity["properties"]
)
stored_entities.append(
{
"type": entity["type"],
"id": entity["id"],
"neo4j_id": result.get("id"),
"properties": entity["properties"],
}
)
logger.debug("Entity stored", type=entity["type"], id=entity["id"])
except Exception as e:
logger.error("Failed to store entity", entity=entity, error=str(e))
return stored_entities
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
        ).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8004, reload=True, log_config=None)

37
apps/svc_normalize_map/requirements.txt Normal file
View File

@@ -0,0 +1,37 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.1.0
numpy>=1.24.0
# Currency and exchange rates
forex-python>=1.8
babel>=2.13.0
# Date and time processing
python-dateutil>=2.8.0
pytz>=2023.3
# Text normalization
unidecode>=1.3.0
phonenumbers>=8.13.0
# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0
# Geographic data
geopy>=2.4.0
pycountry>=23.12.0
# Data validation
cerberus>=1.3.4
marshmallow>=3.20.0
# UK-specific utilities
uk-postcode-utils>=1.0.0

43
apps/svc_ocr/Dockerfile Normal file
View File

@@ -0,0 +1,43 @@
# Dockerfile for svc_ocr - Uses base-ml image
# Base image contains: FastAPI, database drivers, transformers, PyTorch, numpy, etc.
# This Dockerfile adds OCR-specific dependencies and application code
ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install system and service-specific dependencies
USER root
# Install OCR runtime dependencies (Tesseract, poppler)
RUN apt-get update && apt-get install -y \
tesseract-ocr \
tesseract-ocr-eng \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_ocr/ ./apps/svc_ocr/
# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_ocr.main:app", "--host", "0.0.0.0", "--port", "8000"]

504
apps/svc_ocr/main.py Normal file
View File

@@ -0,0 +1,504 @@
# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class OCRSettings(BaseAppSettings):
"""Settings for OCR service"""
service_name: str = "svc-ocr"
# OCR configuration
tesseract_cmd: str = "/usr/bin/tesseract"
tesseract_config: str = "--oem 3 --psm 6"
languages: str = "eng"
# Layout analysis
layoutlm_model: str = "microsoft/layoutlm-base-uncased"
confidence_threshold: float = 0.7
# Processing limits
max_pages: int = 50
max_file_size: int = 100 * 1024 * 1024 # 100MB
# Output configuration
include_coordinates: bool = True
include_confidence: bool = True
# Create app and settings
app, settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus
logger.info("Starting OCR service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
# Subscribe to document ingestion events
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
logger.info("OCR service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down OCR service")
if event_bus:
await event_bus.stop()
logger.info("OCR service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/process/{doc_id}")
async def process_document(
doc_id: str,
background_tasks: BackgroundTasks,
strategy: str = "hybrid",
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Process document with OCR"""
with tracer.start_as_current_span("process_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", strategy)
try:
# Check if document exists
doc_content = await document_storage.get_document(tenant_id, doc_id)
if not doc_content:
raise HTTPException(status_code=404, detail="Document not found")
# Generate processing ID
processing_id = str(ulid.new())
span.set_attribute("processing_id", processing_id)
# Start background processing
background_tasks.add_task(
_process_document_async,
doc_id,
tenant_id,
doc_content,
strategy,
processing_id,
current_user.get("sub", "system"),
)
logger.info(
"OCR processing started", doc_id=doc_id, processing_id=processing_id
)
return {
"processing_id": processing_id,
"doc_id": doc_id,
"status": "processing",
"strategy": strategy,
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start processing")
@app.get("/results/{doc_id}")
async def get_ocr_results(
doc_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get OCR results for document"""
with tracer.start_as_current_span("get_ocr_results") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get OCR results from storage
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
return ocr_results
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get OCR results")
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
"""Handle document ingestion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
if not doc_id or not tenant_id:
logger.warning("Invalid document ingestion event", data=data)
return
# Auto-process PDF documents
if data.get("content_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id)
# Get document content
doc_content = await document_storage.get_document(tenant_id, doc_id)
if doc_content:
await _process_document_async(
doc_id=doc_id,
tenant_id=tenant_id,
content=doc_content,
strategy="hybrid",
processing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle document ingestion", error=str(e))
async def _process_document_async(
doc_id: str,
tenant_id: str,
content: bytes,
strategy: str,
processing_id: str,
actor: str,
) -> None:
"""Process document asynchronously"""
with tracer.start_as_current_span("process_document_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("processing_id", processing_id)
span.set_attribute("strategy", strategy)
        try:
            start_time = datetime.utcnow()
            # Convert PDF to images
            images = await _pdf_to_images(content)
# Process each page
pages_data: list[Any] = []
for page_num, image in enumerate(images, 1):
page_data = await _process_page(image, page_num, strategy)
pages_data.append(page_data)
# Combine results
ocr_results = {
"doc_id": doc_id,
"processing_id": processing_id,
"strategy": strategy,
"processed_at": datetime.utcnow().isoformat(),
"total_pages": len(pages_data),
"pages": pages_data,
"metadata": {
"confidence_threshold": settings.confidence_threshold,
"languages": settings.languages,
},
}
# Store results
await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)
# Update metrics
metrics.counter("documents_processed_total").labels(
tenant_id=tenant_id, strategy=strategy
).inc()
metrics.histogram("processing_duration_seconds").labels(
strategy=strategy
).observe(
datetime.utcnow().timestamp()
- datetime.fromisoformat(
ocr_results["processed_at"].replace("Z", "")
).timestamp()
)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"processing_id": processing_id,
"strategy": strategy,
"total_pages": len(pages_data),
"ocr_results": ocr_results,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)
logger.info(
"OCR processing completed", doc_id=doc_id, pages=len(pages_data)
)
except Exception as e:
logger.error("OCR processing failed", doc_id=doc_id, error=str(e))
# Update error metrics
metrics.counter("processing_errors_total").labels(
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
).inc()
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
"""Convert PDF to images"""
try:
import fitz # PyMuPDF
# Open PDF
pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
images: list[Any] = []
for page_num in range(min(len(pdf_doc), settings.max_pages)):
page = pdf_doc[page_num]
# Render page to image
mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
images.append(img_data)
pdf_doc.close()
return images
except ImportError:
logger.error("PyMuPDF not available, using fallback")
return await _pdf_to_images_fallback(pdf_content)
except Exception as e:
logger.error("PDF conversion failed", error=str(e))
raise
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
"""Fallback PDF to images conversion"""
try:
from pdf2image import convert_from_bytes
images = convert_from_bytes(
pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
)
# Convert PIL images to bytes
image_bytes: list[Any] = []
for img in images:
import io
img_buffer = io.BytesIO()
img.save(img_buffer, format="PNG")
image_bytes.append(img_buffer.getvalue())
return image_bytes
except ImportError:
logger.error("pdf2image not available")
        raise RuntimeError("No PDF conversion library available")
async def _process_page(
image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
"""Process single page with OCR"""
if strategy == "tesseract":
return await _process_with_tesseract(image_data, page_num)
elif strategy == "layoutlm":
return await _process_with_layoutlm(image_data, page_num)
elif strategy == "hybrid":
# Combine both approaches
tesseract_result = await _process_with_tesseract(image_data, page_num)
layoutlm_result = await _process_with_layoutlm(image_data, page_num)
return {
"page": page_num,
"strategy": "hybrid",
"tesseract": tesseract_result,
"layoutlm": layoutlm_result,
"text": tesseract_result.get("text", ""),
"confidence": max(
tesseract_result.get("confidence", 0),
layoutlm_result.get("confidence", 0),
),
}
else:
raise ValueError(f"Unknown strategy: {strategy}")
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with Tesseract OCR"""
try:
import io
import pytesseract
from PIL import Image
# Load image
image = Image.open(io.BytesIO(image_data))
# Configure Tesseract
config = f"{settings.tesseract_config} -l {settings.languages}"
# Extract text with confidence
data = pytesseract.image_to_data(
image, config=config, output_type=pytesseract.Output.DICT
)
# Process results
words: list[Any] = []
confidences: list[Any] = []
for i in range(len(data["text"])):
if int(data["conf"][i]) > 0: # Valid confidence
word_data = {
"text": data["text"][i],
"confidence": int(data["conf"][i]) / 100.0,
"bbox": [
data["left"][i],
data["top"][i],
data["left"][i] + data["width"][i],
data["top"][i] + data["height"][i],
],
}
words.append(word_data)
confidences.append(word_data["confidence"])
# Extract full text
full_text = pytesseract.image_to_string(image, config=config)
return {
"page": page_num,
"strategy": "tesseract",
"text": full_text.strip(),
"words": words,
"confidence": sum(confidences) / len(confidences) if confidences else 0.0,
"word_count": len(words),
}
except ImportError:
logger.error("pytesseract not available")
return {
"page": page_num,
"strategy": "tesseract",
"error": "pytesseract not available",
}
except Exception as e:
logger.error("Tesseract processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "tesseract", "error": str(e)}
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with LayoutLM"""
try:
# This would integrate with LayoutLM model
# For now, return placeholder
logger.warning("LayoutLM processing not implemented")
return {
"page": page_num,
"strategy": "layoutlm",
"text": "",
"layout_elements": [],
"confidence": 0.0,
"error": "Not implemented",
}
except Exception as e:
logger.error("LayoutLM processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
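# --- Illustrative client snippet (not part of the service API) --------------
# A minimal sketch of triggering OCR and fetching results via the endpoints
# above; the base URL and bearer token are assumptions, and results only
# become available once the background task has stored them.
async def example_process(doc_id: str, token: str = "<token>") -> dict[str, Any]:
    import httpx
    headers = {"Authorization": f"Bearer {token}"}
    async with httpx.AsyncClient(
        base_url="http://localhost:8002", headers=headers
    ) as client:
        started = await client.post(f"/process/{doc_id}", params={"strategy": "hybrid"})
        started.raise_for_status()
        results = await client.get(f"/results/{doc_id}")
        return results.json() if results.status_code == 200 else {"status": "pending"}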
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)

16
apps/svc_ocr/requirements.txt Normal file
View File

@@ -0,0 +1,16 @@
# Service-specific dependencies for svc_ocr
# NOTE: ML dependencies (transformers, torch, numpy) are in base-ml image
# OCR engines (lightweight)
pytesseract>=0.3.13
# PDF processing
PyMuPDF>=1.26.4
pdf2image>=1.17.0
# Image processing
Pillow>=11.3.0
opencv-python-headless>=4.12.0.88 # Headless version is smaller
# Computer vision (torchvision not in base-ml)
torchvision>=0.23.0

36
apps/svc_rag_indexer/Dockerfile Normal file
View File

@@ -0,0 +1,36 @@
# Dockerfile for svc_rag_indexer - Uses base-ml image
# Base image contains: FastAPI, database drivers, sentence-transformers, PyTorch, numpy, etc.
# This Dockerfile only adds service-specific dependencies and application code
ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY apps/svc_rag_indexer/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_rag_indexer/ ./apps/svc_rag_indexer/
# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_rag_indexer.main:app", "--host", "0.0.0.0", "--port", "8000"]

535
apps/svc_rag_indexer/main.py Normal file
View File

@@ -0,0 +1,535 @@
# FILE: apps/svc-rag-indexer/main.py
# mypy: disable-error-code=union-attr
# Vector database indexing with PII protection and de-identification
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_qdrant_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.rag import PIIDetector, QdrantCollectionManager
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class RAGIndexerSettings(BaseAppSettings):
"""Settings for RAG indexer service"""
service_name: str = "svc-rag-indexer"
# Embedding configuration
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
embedding_dimension: int = 384
# Chunking configuration
chunk_size: int = 512
chunk_overlap: int = 50
# Collection configuration
collections: dict[str, str] = {
"documents": "Document chunks with metadata",
"tax_rules": "Tax rules and regulations",
"case_law": "Tax case law and precedents",
"guidance": "HMRC guidance and manuals",
}
# PII protection
require_pii_free: bool = True
auto_deidentify: bool = True
# Create app and settings
app, settings = create_app(
service_name="svc-rag-indexer",
title="Tax Agent RAG Indexer Service",
description="Vector database indexing with PII protection",
settings_class=RAGIndexerSettings,
)
# Global clients
qdrant_client = None
collection_manager: QdrantCollectionManager | None = None
pii_detector: PIIDetector | None = None
event_bus: EventBus | None = None
embedding_model = None
tracer = get_tracer("svc-rag-indexer")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global qdrant_client, collection_manager, pii_detector, event_bus, embedding_model
logger.info("Starting RAG indexer service")
# Setup observability
setup_observability(settings)
# Initialize Qdrant client
qdrant_client = create_qdrant_client(settings)
collection_manager = QdrantCollectionManager(qdrant_client)
# Initialize PII detector
pii_detector = PIIDetector()
# Initialize embedding model
try:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(settings.embedding_model)
logger.info("Embedding model loaded", model=settings.embedding_model)
except ImportError:
logger.warning("sentence-transformers not available, using mock embeddings")
embedding_model = None
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
# Subscribe to relevant events
await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted) # type: ignore
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
# Ensure collections exist
for collection_name in settings.collections:
await collection_manager.ensure_collection(
collection_name=collection_name, vector_size=settings.embedding_dimension
)
logger.info("RAG indexer service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down RAG indexer service")
if event_bus:
await event_bus.stop()
logger.info("RAG indexer service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"collections": list(settings.collections.keys()),
}
@app.post("/index/{collection_name}")
async def index_document(
collection_name: str,
document: dict[str, Any],
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
):
"""Index document in vector database"""
with tracer.start_as_current_span("index_document") as span:
span.set_attribute("collection_name", collection_name)
span.set_attribute("tenant_id", tenant_id)
try:
# Validate collection
if collection_name not in settings.collections:
raise HTTPException(
status_code=400, detail=f"Unknown collection: {collection_name}"
)
# Generate indexing ID
indexing_id = str(ulid.new())
span.set_attribute("indexing_id", indexing_id)
# Start background indexing
background_tasks.add_task(
_index_document_async,
collection_name,
document,
tenant_id,
indexing_id,
current_user.get("sub", "system"),
)
logger.info(
"Document indexing started",
collection=collection_name,
indexing_id=indexing_id,
)
return {
"indexing_id": indexing_id,
"collection": collection_name,
"status": "indexing",
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to start indexing", collection=collection_name, error=str(e)
)
raise HTTPException(status_code=500, detail="Failed to start indexing")
@app.get("/collections")
async def list_collections(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
):
"""List available collections"""
try:
collections_info: list[Any] = []
for collection_name, description in settings.collections.items():
# Get collection info from Qdrant
try:
collection_info = qdrant_client.get_collection(collection_name)
point_count = collection_info.points_count
vector_count = collection_info.vectors_count
except Exception:
point_count = 0
vector_count = 0
collections_info.append(
{
"name": collection_name,
"description": description,
"point_count": point_count,
"vector_count": vector_count,
}
)
return {
"collections": collections_info,
"total_collections": len(collections_info),
}
except Exception as e:
logger.error("Failed to list collections", error=str(e))
raise HTTPException(status_code=500, detail="Failed to list collections")
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
"""Handle document extraction completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
extraction_results = data.get("extraction_results")
if not doc_id or not tenant_id or not extraction_results:
logger.warning("Invalid document extraction event", data=data)
return
logger.info("Auto-indexing extracted document", doc_id=doc_id)
# Create document for indexing
document = {
"doc_id": doc_id,
"content": _extract_content_from_results(extraction_results),
"metadata": {
"doc_id": doc_id,
"tenant_id": tenant_id,
"extraction_id": extraction_results.get("extraction_id"),
"confidence": extraction_results.get("confidence", 0.0),
"extracted_at": extraction_results.get("extracted_at"),
"source": "extraction",
},
}
await _index_document_async(
collection_name="documents",
document=document,
tenant_id=tenant_id,
indexing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle document extraction event", error=str(e))
async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
"""Handle knowledge graph upsert events"""
try:
data = payload.data
entities = data.get("entities", [])
tenant_id = data.get("tenant_id")
if not entities or not tenant_id:
logger.warning("Invalid KG upsert event", data=data)
return
logger.info("Auto-indexing KG entities", count=len(entities))
# Index entities as documents
for entity in entities:
document = {
"entity_id": entity.get("id"),
"content": _extract_content_from_entity(entity),
"metadata": {
"entity_type": entity.get("type"),
"entity_id": entity.get("id"),
"tenant_id": tenant_id,
"source": "knowledge_graph",
},
}
await _index_document_async(
collection_name="documents",
document=document,
tenant_id=tenant_id,
indexing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle KG upsert event", error=str(e))
async def _index_document_async(
collection_name: str,
document: dict[str, Any],
tenant_id: str,
indexing_id: str,
actor: str,
):
"""Index document asynchronously"""
with tracer.start_as_current_span("index_document_async") as span:
span.set_attribute("collection_name", collection_name)
span.set_attribute("indexing_id", indexing_id)
span.set_attribute("tenant_id", tenant_id)
try:
content = document.get("content", "")
metadata = document.get("metadata", {})
# Check for PII and de-identify if needed
if settings.require_pii_free:
has_pii = pii_detector.has_pii(content)
if has_pii:
if settings.auto_deidentify:
content, pii_mapping = pii_detector.de_identify_text(content)
metadata["pii_removed"] = True
metadata["pii_mapping_hash"] = _hash_pii_mapping(pii_mapping)
logger.info("PII removed from content", indexing_id=indexing_id)
else:
logger.warning(
"Content contains PII, skipping indexing",
indexing_id=indexing_id,
)
return
# Mark as PII-free
metadata["pii_free"] = True
metadata["tenant_id"] = tenant_id
metadata["indexed_at"] = datetime.utcnow().isoformat()
# Chunk content
chunks = _chunk_text(content)
# Generate embeddings and index chunks
indexed_chunks = 0
for i, chunk in enumerate(chunks):
try:
# Generate embedding
embedding = await _generate_embedding(chunk)
# Create point (Qdrant point IDs must be unsigned integers or UUIDs,
# so derive a deterministic UUID from the indexing ID and chunk index)
import uuid
from qdrant_client.models import PointStruct
point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{indexing_id}_{i}"))
point = PointStruct(
id=point_id,
vector=embedding,
payload={
**metadata,
"chunk_text": chunk,
"chunk_index": i,
"total_chunks": len(chunks),
},
)
# Index point
success = await collection_manager.upsert_points(
collection_name, [point]
)
if success:
indexed_chunks += 1
except Exception as e:
logger.error("Failed to index chunk", chunk_index=i, error=str(e))
# Update metrics
metrics.counter("documents_indexed_total").labels(
tenant_id=tenant_id, collection=collection_name
).inc()
metrics.histogram("chunks_per_document").labels(
collection=collection_name
).observe(indexed_chunks)
# Publish completion event
event_payload = EventPayload(
data={
"indexing_id": indexing_id,
"collection": collection_name,
"tenant_id": tenant_id,
"chunks_indexed": indexed_chunks,
"total_chunks": len(chunks),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.RAG_INDEXED, event_payload)
logger.info(
"Document indexing completed",
indexing_id=indexing_id,
chunks=indexed_chunks,
)
except Exception as e:
logger.error(
"Document indexing failed", indexing_id=indexing_id, error=str(e)
)
# Update error metrics
metrics.counter("indexing_errors_total").labels(
tenant_id=tenant_id,
collection=collection_name,
error_type=type(e).__name__,
).inc()
def _extract_content_from_results(extraction_results: dict[str, Any]) -> str:
"""Extract text content from extraction results"""
content_parts: list[Any] = []
# Add extracted fields
extracted_fields = extraction_results.get("extracted_fields", {})
for field_name, field_value in extracted_fields.items():
content_parts.append(f"{field_name}: {field_value}")
return "\n".join(content_parts)
def _extract_content_from_entity(entity: dict[str, Any]) -> str:
"""Extract text content from KG entity"""
content_parts: list[Any] = []
# Add entity type and ID
entity_type = entity.get("type", "Unknown")
entity_id = entity.get("id", "")
content_parts.append(f"Entity Type: {entity_type}")
content_parts.append(f"Entity ID: {entity_id}")
# Add properties
properties = entity.get("properties", {})
for prop_name, prop_value in properties.items():
if prop_name not in ["tenant_id", "asserted_at", "retracted_at"]:
content_parts.append(f"{prop_name}: {prop_value}")
return "\n".join(content_parts)
def _chunk_text(text: str) -> list[str]:
"""Chunk text into smaller pieces"""
if not text:
return []
# Simple chunking by sentences/paragraphs
chunks: list[Any] = []
current_chunk = ""
sentences = text.split(". ")
for sentence in sentences:
if len(current_chunk) + len(sentence) < settings.chunk_size:
current_chunk += sentence + ". "
else:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence + ". "
if current_chunk:
chunks.append(current_chunk.strip())
return chunks
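# Note: sentences are packed whole into chunks of up to roughly `chunk_size`
# characters; the `chunk_overlap` setting declared in RAGIndexerSettings above
# is not applied by this simple splitter. Illustrative behaviour (hypothetical
# input): a 1,200-character passage of ~60-character sentences comes back as
# two chunks of roughly 500 characters plus a shorter final chunk, each split
# on sentence boundaries.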
async def _generate_embedding(text: str) -> list[float]:
"""Generate embedding for text"""
if embedding_model:
try:
embedding = embedding_model.encode(text)
return embedding.tolist()
except Exception as e:
logger.error("Failed to generate embedding", error=str(e))
# Fallback: random embedding
import random
return [random.random() for _ in range(settings.embedding_dimension)]
def _hash_pii_mapping(pii_mapping: dict[str, str]) -> str:
"""Create hash of PII mapping for audit purposes"""
import hashlib
import json
mapping_json = json.dumps(pii_mapping, sort_keys=True)
return hashlib.sha256(mapping_json.encode()).hexdigest()
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8006, reload=True, log_config=None)

View File

@@ -0,0 +1,19 @@
# Service-specific dependencies for svc_rag_indexer
# NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
# Text chunking (lightweight alternative to langchain)
tiktoken>=0.11.0
# Text preprocessing (lightweight)
beautifulsoup4>=4.14.2
# Text similarity (CPU-only)
faiss-cpu>=1.12.0
# Document processing (lightweight)
python-docx>=1.2.0
python-pptx>=1.0.2
openpyxl>=3.1.5
# Sparse vector processing
sparse-dot-topn>=1.1.5

View File

@@ -0,0 +1,36 @@
# Dockerfile for svc_rag_retriever - Uses base-ml image
# Base image contains: FastAPI, database drivers, sentence-transformers, PyTorch, etc.
# This Dockerfile only adds service-specific dependencies and application code
ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY apps/svc_rag_retriever/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_rag_retriever/ ./apps/svc_rag_retriever/
# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_rag_retriever.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,476 @@
# FILE: apps/svc_rag_retriever/main.py
# mypy: disable-error-code=union-attr
# Hybrid search with KG fusion, reranking, and calibrated confidence
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
from fastapi import Depends, HTTPException, Query, Request
from fastapi.responses import JSONResponse
from qdrant_client.models import SparseVector
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.calibration import ConfidenceCalibrator
from libs.config import (
BaseAppSettings,
create_event_bus,
create_neo4j_client,
create_qdrant_client,
)
from libs.events import EventBus
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.rag import RAGRetriever
from libs.schemas import ErrorResponse, RAGSearchRequest, RAGSearchResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class RAGRetrieverSettings(BaseAppSettings):
"""Settings for RAG retriever service"""
service_name: str = "svc-rag-retriever"
# Embedding configuration
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
embedding_dimension: int = 384
# Search configuration
default_k: int = 10
max_k: int = 100
alpha: float = 0.5 # Dense/sparse balance
beta: float = 0.3 # Vector/KG balance
gamma: float = 0.2 # Reranking weight
# Collections to search
search_collections: list[str] = ["documents", "tax_rules", "guidance"]
# Reranking
reranker_model: str | None = None
rerank_top_k: int = 50
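# How the weights above are expected to combine (assumption - the actual fusion
# lives in libs.rag.RAGRetriever, so treat this as an illustrative sketch):
#   vector_score = alpha * dense_score + (1 - alpha) * sparse_score
#   fused_score  = (1 - beta) * vector_score + beta * kg_score
# with gamma scaling the reranker's adjustment over the top rerank_top_k hits.
# Worked example with the defaults and hypothetical scores dense=0.8, sparse=0.4,
# kg=0.6: vector_score = 0.5*0.8 + 0.5*0.4 = 0.60; fused_score = 0.7*0.60 + 0.3*0.6 = 0.60.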
# Create app and settings
app, settings = create_app(
service_name="svc-rag-retriever",
title="Tax Agent RAG Retriever Service",
description="Hybrid search with KG fusion and reranking",
settings_class=RAGRetrieverSettings,
)
# Global clients
qdrant_client = None
neo4j_client: Neo4jClient | None = None
rag_retriever: RAGRetriever | None = None
event_bus: EventBus | None = None
embedding_model = None
confidence_calibrator: ConfidenceCalibrator | None = None
tracer = get_tracer("svc-rag-retriever")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global qdrant_client, neo4j_client, rag_retriever, event_bus, embedding_model, confidence_calibrator
logger.info("Starting RAG retriever service")
# Setup observability
setup_observability(settings)
# Initialize Qdrant client
qdrant_client = create_qdrant_client(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize RAG retriever
rag_retriever = RAGRetriever(
qdrant_client=qdrant_client,
neo4j_client=neo4j_client,
reranker_model=settings.reranker_model,
)
# Initialize embedding model
try:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(settings.embedding_model)
logger.info("Embedding model loaded", model=settings.embedding_model)
except ImportError:
logger.warning("sentence-transformers not available, using mock embeddings")
embedding_model = None
# Initialize confidence calibrator
confidence_calibrator = ConfidenceCalibrator(method="isotonic")
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info("RAG retriever service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down RAG retriever service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("RAG retriever service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"search_collections": settings.search_collections,
}
@app.post("/search", response_model=RAGSearchResponse)
async def search(
request_data: RAGSearchRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> RAGSearchResponse:
"""Perform hybrid RAG search"""
with tracer.start_as_current_span("rag_search") as span:
span.set_attribute("query", request_data.query[:100])
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("k", request_data.k)
try:
# Generate embeddings for query
dense_vector = await _generate_embedding(request_data.query)
sparse_vector = await _generate_sparse_vector(request_data.query)
# Perform search
search_results = await rag_retriever.search( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
query=request_data.query,
collections=settings.search_collections,
dense_vector=dense_vector,
sparse_vector=sparse_vector,
k=request_data.k,
alpha=settings.alpha,
beta=settings.beta,
gamma=settings.gamma,
tax_year=request_data.tax_year,
jurisdiction=request_data.jurisdiction,
)
# Update metrics
metrics.counter("searches_total").labels(tenant_id=tenant_id).inc()
metrics.histogram("search_results_count").labels(
tenant_id=tenant_id
).observe(len(search_results["chunks"]))
metrics.histogram("search_confidence").labels(tenant_id=tenant_id).observe(
search_results["calibrated_confidence"]
)
logger.info(
"RAG search completed",
query=request_data.query[:50],
results=len(search_results["chunks"]),
confidence=search_results["calibrated_confidence"],
)
return RAGSearchResponse(
chunks=search_results["chunks"],
citations=search_results["citations"],
kg_hints=search_results["kg_hints"],
calibrated_confidence=search_results["calibrated_confidence"],
)
except Exception as e:
logger.error(
"RAG search failed", query=request_data.query[:50], error=str(e)
)
# Update error metrics
metrics.counter("search_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
@app.get("/similar/{doc_id}")
async def find_similar_documents(
doc_id: str,
k: int = Query(default=10, le=settings.max_k),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Find documents similar to given document"""
with tracer.start_as_current_span("find_similar") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("k", k)
try:
# Get document content from vector database
# This would search for the document by doc_id in metadata
from qdrant_client.models import FieldCondition, Filter, MatchValue
filter_conditions = Filter(
must=[
FieldCondition(key="doc_id", match=MatchValue(value=doc_id)),
FieldCondition(key="tenant_id", match=MatchValue(value=tenant_id)),
]
)
# Search for the document
doc_results = await rag_retriever.collection_manager.search_dense( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
collection_name="documents",
query_vector=[0.0] * settings.embedding_dimension, # Dummy vector
limit=1,
filter_conditions=filter_conditions,
)
if not doc_results:
raise HTTPException(status_code=404, detail="Document not found")
# Get the document's vector and use it for similarity search
doc_vector = doc_results[0]["payload"].get("vector")
if not doc_vector:
raise HTTPException(status_code=400, detail="Document has no vector")
# Find similar documents
similar_results = await rag_retriever.collection_manager.search_dense( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
collection_name="documents",
query_vector=doc_vector,
limit=k + 1, # +1 to exclude the original document
filter_conditions=Filter(
must=[
FieldCondition(
key="tenant_id", match=MatchValue(value=tenant_id)
)
],
must_not=[
FieldCondition(key="doc_id", match=MatchValue(value=doc_id))
],
),
)
return {
"doc_id": doc_id,
"similar_documents": similar_results[:k],
"count": len(similar_results[:k]),
}
except HTTPException:
raise
except Exception as e:
logger.error("Similar document search failed", doc_id=doc_id, error=str(e))
raise HTTPException(
status_code=500, detail=f"Similar search failed: {str(e)}"
)
@app.post("/explain")
async def explain_search(
query: str,
search_results: list[dict[str, Any]],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Explain search results and ranking"""
with tracer.start_as_current_span("explain_search") as span:
span.set_attribute("query", query[:100])
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("results_count", len(search_results))
try:
explanations = []
for i, result in enumerate(search_results):
explanation = {
"rank": i + 1,
"chunk_id": result.get("id"),
"score": result.get("score", 0.0),
"dense_score": result.get("dense_score", 0.0),
"sparse_score": result.get("sparse_score", 0.0),
"collection": result.get("collection"),
"explanation": _generate_explanation(query, result),
}
explanations.append(explanation)
return {
"query": query,
"explanations": explanations,
"ranking_factors": {
"alpha": settings.alpha,
"beta": settings.beta,
"gamma": settings.gamma,
},
}
except Exception as e:
logger.error("Search explanation failed", error=str(e))
raise HTTPException(status_code=500, detail=f"Explanation failed: {str(e)}")
async def _generate_embedding(text: str) -> list[float]:
"""Generate dense embedding for text"""
if embedding_model:
try:
embedding = embedding_model.encode(text)
return embedding.tolist()
except Exception as e:
logger.error("Failed to generate embedding", error=str(e))
# Fallback: random embedding
import random
return [random.random() for _ in range(settings.embedding_dimension)]
async def _generate_sparse_vector(text: str) -> SparseVector:
"""Generate sparse vector for text (BM25-style)"""
try:
# This would use a proper sparse encoder like SPLADE
# For now, create a simple sparse representation
from qdrant_client.models import SparseVector
# Simple word-based sparse vector
import hashlib
words = text.lower().split()
word_counts: dict[str, int] = {}
for word in words:
word_counts[word] = word_counts.get(word, 0) + 1
# Convert to sparse vector format
indices = []
values = []
for word, count in word_counts.items():
# Use a stable hash of the word as index so the same term always maps to the same dimension
word_hash = int(hashlib.md5(word.encode()).hexdigest(), 16) % 10000 # Limit vocabulary size
indices.append(word_hash)
values.append(float(count))
return SparseVector(indices=indices, values=values)
except Exception as e:
logger.error("Failed to generate sparse vector", error=str(e))
# Return empty sparse vector
from qdrant_client.models import SparseVector
return SparseVector(indices=[], values=[])
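# Illustrative shape of the output (hypothetical query): "capital gains tax"
# yields one dimension per distinct lower-cased word, e.g.
#   SparseVector(indices=[i_capital, i_gains, i_tax], values=[1.0, 1.0, 1.0])
# where each index is the word's hash modulo 10,000 and each value is the
# term's count within the query.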
def _generate_explanation(query: str, result: dict[str, Any]) -> str:
"""Generate human-readable explanation for search result"""
explanations = []
# Score explanation
score = result.get("score", 0.0)
dense_score = result.get("dense_score", 0.0)
sparse_score = result.get("sparse_score", 0.0)
explanations.append(f"Overall score: {score:.3f}")
if dense_score > 0:
explanations.append(f"Semantic similarity: {dense_score:.3f}")
if sparse_score > 0:
explanations.append(f"Keyword match: {sparse_score:.3f}")
# Collection explanation
collection = result.get("collection")
if collection:
explanations.append(f"Source: {collection}")
# Metadata explanation
payload = result.get("payload", {})
doc_id = payload.get("doc_id")
if doc_id:
explanations.append(f"Document: {doc_id}")
confidence = payload.get("confidence")
if confidence:
explanations.append(f"Extraction confidence: {confidence:.3f}")
return "; ".join(explanations)
@app.get("/stats")
async def get_search_stats(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get search statistics"""
try:
# This would aggregate metrics from Prometheus
# For now, return mock stats
stats = {
"total_searches": 1000,
"avg_results_per_search": 8.5,
"avg_confidence": 0.75,
"collections": {
"documents": {"searches": 800, "avg_confidence": 0.78},
"tax_rules": {"searches": 150, "avg_confidence": 0.85},
"guidance": {"searches": 50, "avg_confidence": 0.70},
},
"top_queries": [
{"query": "capital gains tax", "count": 45},
{"query": "business expenses", "count": 38},
{"query": "property income", "count": 32},
],
}
return stats
except Exception as e:
logger.error("Failed to get search stats", error=str(e))
raise HTTPException(status_code=500, detail="Failed to get stats")
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8007, reload=True, log_config=None)

View File

@@ -0,0 +1,11 @@
# Service-specific dependencies for svc_rag_retriever
# NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
# Search and ranking (lightweight)
rank-bm25>=0.2.2
# Vector similarity (CPU-only, lighter than GPU version)
faiss-cpu>=1.12.0
# Sparse retrieval
sparse-dot-topn>=1.1.5

View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_reason
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_reason/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_reason/ ./apps/svc_reason/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_reason.main:app", "--host", "0.0.0.0", "--port", "8000"]

677
apps/svc_reason/main.py Normal file
View File

@@ -0,0 +1,677 @@
"""Tax calculation engine with schedule computation and evidence trails."""
# mypy: disable-error-code=union-attr
# FILE: apps/svc_reason/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, ScheduleComputeRequest, ScheduleComputeResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class ReasonSettings(BaseAppSettings):
"""Settings for reasoning service"""
service_name: str = "svc-reason"
# Tax year configuration
current_tax_year: str = "2023-24"
supported_tax_years: list[str] = ["2021-22", "2022-23", "2023-24", "2024-25"]
# Calculation configuration
precision: int = 2 # Decimal places
rounding_method: str = "ROUND_HALF_UP"
# Schedule support
supported_schedules: list[str] = ["SA100", "SA103", "SA105", "SA106"]
# Validation
max_income: float = 10000000.0 # £10M
max_expenses: float = 10000000.0 # £10M
# Create app and settings
app, settings = create_app(
service_name="svc-reason",
title="Tax Agent Reasoning Service",
description="Tax calculation engine with schedule computation",
settings_class=ReasonSettings,
)
# Global clients
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-reason")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, event_bus
logger.info("Starting reasoning service")
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Subscribe to KG upsert events
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
logger.info("Reasoning service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down reasoning service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Reasoning service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"supported_schedules": settings.supported_schedules,
}
@app.post("/compute", response_model=ScheduleComputeResponse)
async def compute_schedule(
request_data: ScheduleComputeRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> ScheduleComputeResponse:
"""Compute tax schedule"""
with tracer.start_as_current_span("compute_schedule") as span:
span.set_attribute("tax_year", request_data.tax_year)
span.set_attribute("taxpayer_id", request_data.taxpayer_id)
span.set_attribute("schedule_id", request_data.schedule_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Validate inputs
if request_data.tax_year not in settings.supported_tax_years:
raise HTTPException(
status_code=400,
detail=f"Unsupported tax year: {request_data.tax_year}",
)
if request_data.schedule_id not in settings.supported_schedules:
raise HTTPException(
status_code=400,
detail=f"Unsupported schedule: {request_data.schedule_id}",
)
# Generate calculation ID
calculation_id = str(ulid.new())
span.set_attribute("calculation_id", calculation_id)
# Start background computation
background_tasks.add_task(
_compute_schedule_async,
request_data.tax_year,
request_data.taxpayer_id,
request_data.schedule_id,
tenant_id,
calculation_id,
current_user.get("sub", "system"),
)
logger.info(
"Schedule computation started",
calculation_id=calculation_id,
schedule=request_data.schedule_id,
)
return ScheduleComputeResponse(
calculation_id=calculation_id,
schedule=request_data.schedule_id,
form_boxes={}, # Will be populated when computation completes
evidence_trail=[],
)
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start computation", error=str(e))
raise HTTPException(status_code=500, detail="Failed to start computation")
@app.get("/calculations/{calculation_id}")
async def get_calculation_results(
calculation_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get calculation results"""
with tracer.start_as_current_span("get_calculation_results") as span:
span.set_attribute("calculation_id", calculation_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Query calculation from Neo4j
query = """
MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})
WHERE c.retracted_at IS NULL
RETURN c
"""
results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
query, {"calculation_id": calculation_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Calculation not found")
calculation = results[0]["c"]
# Get form boxes
form_boxes_query = """
MATCH (c:Calculation {calculation_id: $calculation_id})-[:HAS_BOX]->(b:FormBox)
WHERE c.retracted_at IS NULL AND b.retracted_at IS NULL
RETURN b
"""
box_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
form_boxes_query, {"calculation_id": calculation_id}
)
form_boxes = {}
for box_result in box_results:
box = box_result["b"]
form_boxes[box["box"]] = {
"value": box["value"],
"description": box.get("description"),
"confidence": box.get("confidence"),
}
return {
"calculation_id": calculation_id,
"schedule": calculation.get("schedule"),
"tax_year": calculation.get("tax_year"),
"status": calculation.get("status", "completed"),
"form_boxes": form_boxes,
"calculated_at": calculation.get("calculated_at"),
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get calculation results",
calculation_id=calculation_id,
error=str(e),
)
raise HTTPException(
status_code=500, detail="Failed to get calculation results"
)
async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
"""Handle KG upsert events for auto-calculation"""
try:
data = payload.data
entities = data.get("entities", [])
tenant_id = data.get("tenant_id")
# Check if we have enough data for calculation
has_income = any(e.get("type") == "IncomeItem" for e in entities)
has_expenses = any(e.get("type") == "ExpenseItem" for e in entities)
if has_income or has_expenses:
logger.info(
"Auto-triggering calculation due to new financial data",
tenant_id=tenant_id,
)
# Find taxpayer ID from entities
taxpayer_id = None
for entity in entities:
if entity.get("type") == "TaxpayerProfile":
taxpayer_id = entity.get("id")
break
if taxpayer_id:
await _compute_schedule_async(
tax_year=settings.current_tax_year,
taxpayer_id=taxpayer_id,
schedule_id="SA103", # Default to self-employment
tenant_id=tenant_id or "",
calculation_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle KG upsert for auto-calculation", error=str(e))
async def _compute_schedule_async(
tax_year: str,
taxpayer_id: str,
schedule_id: str,
tenant_id: str,
calculation_id: str,
actor: str,
) -> None:
"""Compute schedule asynchronously"""
with tracer.start_as_current_span("compute_schedule_async") as span:
span.set_attribute("calculation_id", calculation_id)
span.set_attribute("schedule_id", schedule_id)
span.set_attribute("tax_year", tax_year)
try:
# Get relevant data from knowledge graph
financial_data = await _get_financial_data(taxpayer_id, tax_year, tenant_id)
# Perform calculations based on schedule
if schedule_id == "SA103":
form_boxes, evidence_trail = await _compute_sa103(
financial_data, tax_year
)
elif schedule_id == "SA105":
form_boxes, evidence_trail = await _compute_sa105(
financial_data, tax_year
)
elif schedule_id == "SA100":
form_boxes, evidence_trail = await _compute_sa100(
financial_data, tax_year
)
else:
raise ValueError(f"Unsupported schedule: {schedule_id}")
# Store calculation in knowledge graph
await _store_calculation(
calculation_id,
schedule_id,
tax_year,
taxpayer_id,
form_boxes,
evidence_trail,
tenant_id,
)
# Update metrics
metrics.counter("calculations_completed_total").labels(
tenant_id=tenant_id, schedule=schedule_id, tax_year=tax_year
).inc()
# Publish completion event
event_payload = EventPayload(
data={
"calculation_id": calculation_id,
"schedule": schedule_id,
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
"tenant_id": tenant_id,
"form_boxes": form_boxes,
"box_count": len(form_boxes),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.CALC_SCHEDULE_READY, event_payload) # type: ignore
logger.info(
"Schedule computation completed",
calculation_id=calculation_id,
schedule=schedule_id,
boxes=len(form_boxes),
)
except Exception as e:
logger.error(
"Schedule computation failed",
calculation_id=calculation_id,
error=str(e),
)
# Update error metrics
metrics.counter("calculation_errors_total").labels(
tenant_id=tenant_id, schedule=schedule_id, error_type=type(e).__name__
).inc()
async def _get_financial_data(
taxpayer_id: str, tax_year: str, tenant_id: str
) -> dict[str, Any]:
"""Get financial data from knowledge graph"""
# Get income items
income_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_INCOME]->(i:IncomeItem)
WHERE i.retracted_at IS NULL
AND i.tax_year = $tax_year
RETURN i
"""
income_results = (
await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
income_query,
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
)
)
# Get expense items
expense_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_EXPENSE]->(e:ExpenseItem)
WHERE e.retracted_at IS NULL
AND e.tax_year = $tax_year
RETURN e
"""
expense_results = (
await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
expense_query,
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
)
)
return {
"income_items": [result["i"] for result in income_results],
"expense_items": [result["e"] for result in expense_results],
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
}
async def _compute_sa103(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA103 (Self-employment) schedule"""
income_items = financial_data.get("income_items", [])
expense_items = financial_data.get("expense_items", [])
# Calculate totals
total_turnover = Decimal("0")
total_expenses = Decimal("0")
evidence_trail = []
# Sum income
for income in income_items:
if income.get("type") == "self_employment":
amount = Decimal(str(income.get("gross", 0)))
total_turnover += amount
evidence_trail.append(
{
"box": "20",
"source_entity": income.get("income_id"),
"amount": float(amount),
"description": f"Income: {income.get('description', 'Unknown')}",
}
)
# Sum expenses
for expense in expense_items:
if expense.get("allowable", True):
amount = Decimal(str(expense.get("amount", 0)))
total_expenses += amount
evidence_trail.append(
{
"box": "31",
"source_entity": expense.get("expense_id"),
"amount": float(amount),
"description": f"Expense: {expense.get('description', 'Unknown')}",
}
)
# Calculate net profit
net_profit = total_turnover - total_expenses
# Create form boxes
form_boxes = {
"20": {
"value": float(total_turnover),
"description": "Total turnover",
"confidence": 0.9,
},
"31": {
"value": float(total_expenses),
"description": "Total allowable business expenses",
"confidence": 0.9,
},
"32": {
"value": float(net_profit),
"description": "Net profit",
"confidence": 0.9,
},
}
return form_boxes, evidence_trail
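# Worked example (hypothetical figures): two self-employment income items of
# £30,000 and £20,000 plus £12,000 of allowable expenses produce
#   box 20 (total turnover) = 50000.0
#   box 31 (total allowable expenses) = 12000.0
#   box 32 (net profit) = 38000.0
# with one evidence-trail entry recorded per contributing item.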
async def _compute_sa105(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA105 (Property income) schedule"""
income_items = financial_data.get("income_items", [])
expense_items = financial_data.get("expense_items", [])
# Calculate property income and expenses
total_rents = Decimal("0")
total_property_expenses = Decimal("0")
evidence_trail = []
# Sum property income
for income in income_items:
if income.get("type") == "property":
amount = Decimal(str(income.get("gross", 0)))
total_rents += amount
evidence_trail.append(
{
"box": "20",
"source_entity": income.get("income_id"),
"amount": float(amount),
"description": f"Property income: {income.get('description', 'Unknown')}",
}
)
# Sum property expenses
for expense in expense_items:
if expense.get("type") == "property" and expense.get("allowable", True):
amount = Decimal(str(expense.get("amount", 0)))
total_property_expenses += amount
# Map to appropriate SA105 box based on expense category
box = _map_property_expense_to_box(expense.get("category", "other"))
evidence_trail.append(
{
"box": box,
"source_entity": expense.get("expense_id"),
"amount": float(amount),
"description": f"Property expense: {expense.get('description', 'Unknown')}",
}
)
# Calculate net property income
net_property_income = total_rents - total_property_expenses
form_boxes = {
"20": {
"value": float(total_rents),
"description": "Total rents and other income",
"confidence": 0.9,
},
"38": {
"value": float(total_property_expenses),
"description": "Total property expenses",
"confidence": 0.9,
},
"net_income": {
"value": float(net_property_income),
"description": "Net property income",
"confidence": 0.9,
},
}
return form_boxes, evidence_trail
async def _compute_sa100(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA100 (Main return) schedule"""
# This would aggregate from other schedules
# For now, return basic structure
form_boxes = {
"1": {"value": "John Doe", "description": "Your name", "confidence": 0.9}
}
evidence_trail: list[dict[str, Any]] = []
return form_boxes, evidence_trail
def _map_property_expense_to_box(category: str) -> str:
"""Map property expense category to SA105 box"""
mapping = {
"rent_rates_insurance": "31",
"property_management": "32",
"services_wages": "33",
"repairs_maintenance": "34",
"finance_costs": "35",
"professional_fees": "36",
"costs_of_services": "37",
"other": "38",
}
return mapping.get(category, "38")
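# Example: _map_property_expense_to_box("repairs_maintenance") returns "34";
# any unrecognised category (e.g. a hypothetical "ground_rent") falls back to box "38".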
async def _store_calculation(
calculation_id: str,
schedule: str,
tax_year: str,
taxpayer_id: str,
form_boxes: dict[str, Any],
evidence_trail: list[dict[str, Any]],
tenant_id: str,
) -> None:
"""Store calculation results in knowledge graph"""
# Create calculation node
calc_properties = {
"calculation_id": calculation_id,
"schedule": schedule,
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
"tenant_id": tenant_id,
"calculated_at": datetime.utcnow().isoformat(),
"status": "completed",
"source": "reasoning_engine",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("Calculation", calc_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Create form box nodes
for box_id, box_data in form_boxes.items():
box_properties = {
"form": schedule,
"box": box_id,
"value": box_data["value"],
"description": box_data.get("description"),
"confidence": box_data.get("confidence"),
"calculation_id": calculation_id,
"tenant_id": tenant_id,
"source": "reasoning_engine",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("FormBox", box_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Create relationship
await neo4j_client.create_relationship( # pyright: ignore[reportOptionalMemberAccess]
"Calculation",
calculation_id,
"FormBox",
f"{calculation_id}_{box_id}",
"HAS_BOX",
)
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8008, reload=True, log_config=None)

View File

@@ -0,0 +1,35 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Mathematical calculations
# decimal is part of Python standard library
sympy>=1.12.0
# Tax calculations
numpy>=2.3.3
pandas>=2.1.0
# Date and time calculations
python-dateutil>=2.8.0
pytz>=2023.3
# UK tax specific
# uk-tax-calculator>=1.0.0 # Package may not exist, commenting out
# Business rules engine
# python-rules>=1.3.0 # Package may not exist, commenting out
# Financial calculations
# quantlib>=1.32.0 # Package may not exist, commenting out
# Data validation
cerberus>=1.3.4
# Template processing for explanations
jinja2>=3.1.0
# Statistical calculations
scipy>=1.11.0

53
apps/svc_rpa/Dockerfile Normal file
View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_rpa
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rpa/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_rpa/ ./apps/svc_rpa/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_rpa.main:app", "--host", "0.0.0.0", "--port", "8000"]

524
apps/svc_rpa/main.py Normal file
View File

@@ -0,0 +1,524 @@
# FILE: apps/svc_rpa/main.py
# mypy: disable-error-code=union-attr
# Playwright automation for portal data extraction (HMRC, banks, etc.)
import asyncio
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from playwright.async_api import Browser, Page, async_playwright
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_vault_client
from libs.events import EventBus, EventPayload
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
logger = structlog.get_logger()
class RPASettings(BaseAppSettings):
"""Settings for RPA service"""
service_name: str = "svc-rpa"
# Browser configuration
browser_type: str = "chromium" # chromium, firefox, webkit
headless: bool = True
timeout: int = 30000 # 30 seconds
# Portal configurations
hmrc_base_url: str = "https://www.gov.uk/log-in-hmrc-online-services"
open_banking_enabled: bool = False
# Security
max_concurrent_sessions: int = 5
session_timeout: int = 300 # 5 minutes
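# Intended session lifecycle (as implemented by the endpoints below):
#   POST   /sessions                  -> create a browser context, returns session_id
#   POST   /sessions/{id}/navigate    -> drive the page to a portal URL
#   POST   /sessions/{id}/login       -> decrypt Vault-wrapped credentials and sign in
#   POST   /sessions/{id}/extract     -> scrape data and publish rpa.data_extracted
#   DELETE /sessions/{id}             -> close the context; idle sessions are also
#                                        reaped after session_timeout seconds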
# Create app and settings
app, settings = create_app(
service_name="svc-rpa",
title="Tax Agent RPA Service",
description="Robotic Process Automation for portal data extraction",
settings_class=RPASettings,
)
# Global clients
vault_helper: VaultTransitHelper | None = None
event_bus: EventBus | None = None
browser: Browser | None = None
active_sessions: dict[str, dict[str, Any]] = {}
tracer = get_tracer("svc-rpa")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global vault_helper, event_bus, browser
logger.info("Starting RPA service")
# Setup observability
setup_observability(settings)
# Initialize Vault helper
vault_client = create_vault_client(settings)
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Initialize browser
playwright = await async_playwright().start()
browser = await playwright[settings.browser_type].launch(
headless=settings.headless,
args=["--no-sandbox", "--disable-dev-shm-usage"] if settings.headless else [],
)
logger.info("RPA service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, browser
logger.info("Shutting down RPA service")
if browser:
await browser.close()
if event_bus:
await event_bus.stop()
logger.info("RPA service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"active_sessions": len(active_sessions),
}
@app.post("/sessions")
async def create_session(
portal: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create new RPA session"""
with tracer.start_as_current_span("create_session") as span:
span.set_attribute("portal", portal)
span.set_attribute("tenant_id", tenant_id)
try:
# Check session limits
if len(active_sessions) >= settings.max_concurrent_sessions:
raise HTTPException(status_code=429, detail="Too many active sessions")
# Generate session ID
session_id = str(ulid.new())
span.set_attribute("session_id", session_id)
# Create browser context
context = await browser.new_context( # pyright: ignore[reportOptionalMemberAccess]
viewport={"width": 1920, "height": 1080},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
)
page = await context.new_page()
# Store session
active_sessions[session_id] = {
"context": context,
"page": page,
"portal": portal,
"tenant_id": tenant_id,
"user_id": current_user.get("sub"),
"created_at": datetime.utcnow(),
"last_activity": datetime.utcnow(),
}
# Schedule session cleanup
background_tasks.add_task(
_cleanup_session_after_timeout, session_id, settings.session_timeout
)
logger.info("RPA session created", session_id=session_id, portal=portal)
return {
"session_id": session_id,
"portal": portal,
"status": "created",
"expires_at": (
datetime.utcnow().timestamp() + settings.session_timeout
),
}
except Exception as e:
logger.error("Failed to create session", error=str(e))
raise HTTPException(status_code=500, detail="Failed to create session")
@app.post("/sessions/{session_id}/navigate")
async def navigate_to_url(
session_id: str,
url: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Navigate to URL in session"""
with tracer.start_as_current_span("navigate") as span:
span.set_attribute("session_id", session_id)
span.set_attribute("url", url)
try:
session = _get_session(session_id, tenant_id)
page = session["page"]
# Navigate to URL
response = await page.goto(url, timeout=settings.timeout)
# Update last activity
session["last_activity"] = datetime.utcnow()
# Take screenshot for debugging
await page.screenshot()
logger.info(
"Navigated to URL",
session_id=session_id,
url=url,
status=response.status,
)
return {
"status": "success",
"url": page.url,
"title": await page.title(),
"response_status": response.status,
}
except Exception as e:
logger.error(
"Navigation failed", session_id=session_id, url=url, error=str(e)
)
raise HTTPException(status_code=500, detail=f"Navigation failed: {str(e)}")
@app.post("/sessions/{session_id}/login")
async def login_to_portal(
session_id: str,
credentials: dict[str, str],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Login to portal using encrypted credentials"""
with tracer.start_as_current_span("login") as span:
span.set_attribute("session_id", session_id)
try:
session = _get_session(session_id, tenant_id)
page = session["page"]
portal = session["portal"]
# Decrypt credentials
decrypted_credentials: dict[str, Any] = {}
for key, encrypted_value in credentials.items():
decrypted_credentials[key] = (
vault_helper.decrypt_field( # pyright: ignore[reportOptionalMemberAccess]
key_name=key, ciphertext=encrypted_value
)
)
# Perform login based on portal type
if portal == "hmrc":
success = await _login_hmrc(page, decrypted_credentials)
elif portal == "open_banking":
success = await _login_open_banking(page, decrypted_credentials)
else:
raise ValueError(f"Unsupported portal: {portal}")
# Update session
session["last_activity"] = datetime.utcnow()
session["authenticated"] = success
if success:
logger.info("Login successful", session_id=session_id, portal=portal)
return {"status": "success", "authenticated": True}
else:
logger.warning("Login failed", session_id=session_id, portal=portal)
return {"status": "failed", "authenticated": False}
except Exception as e:
logger.error("Login error", session_id=session_id, error=str(e))
raise HTTPException(status_code=500, detail=f"Login failed: {str(e)}")
@app.post("/sessions/{session_id}/extract")
async def extract_data(
session_id: str,
extraction_config: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Extract data from portal"""
with tracer.start_as_current_span("extract_data") as span:
span.set_attribute("session_id", session_id)
try:
session = _get_session(session_id, tenant_id)
page = session["page"]
portal = session["portal"]
# Check authentication
if not session.get("authenticated", False):
raise HTTPException(status_code=401, detail="Session not authenticated")
# Extract data based on portal and config
if portal == "hmrc":
extracted_data = await _extract_hmrc_data(page, extraction_config)
elif portal == "open_banking":
extracted_data = await _extract_banking_data(page, extraction_config)
else:
raise ValueError(f"Unsupported portal: {portal}")
# Update session
session["last_activity"] = datetime.utcnow()
# Publish extraction event
event_payload = EventPayload(
data={
"session_id": session_id,
"portal": portal,
"extraction_config": extraction_config,
"extracted_data": extracted_data,
"tenant_id": tenant_id,
},
actor=current_user.get("sub", "system"),
tenant_id=tenant_id,
trace_id=span.get_span_context().trace_id,
)
await event_bus.publish("rpa.data_extracted", event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info(
"Data extracted",
session_id=session_id,
portal=portal,
records_count=len(extracted_data.get("records", [])),
)
return {
"status": "success",
"extracted_data": extracted_data,
"records_count": len(extracted_data.get("records", [])),
}
except Exception as e:
logger.error("Data extraction failed", session_id=session_id, error=str(e))
raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}")
@app.delete("/sessions/{session_id}")
async def close_session(
session_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, str]:
"""Close RPA session"""
with tracer.start_as_current_span("close_session") as span:
span.set_attribute("session_id", session_id)
try:
session = _get_session(session_id, tenant_id)
# Close browser context
await session["context"].close()
# Remove from active sessions
del active_sessions[session_id]
logger.info("Session closed", session_id=session_id)
return {"status": "closed"}
except Exception as e:
logger.error("Failed to close session", session_id=session_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to close session")
def _get_session(session_id: str, tenant_id: str) -> dict[str, Any]:
"""Get and validate session"""
if session_id not in active_sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = active_sessions[session_id]
# Check tenant access
if session["tenant_id"] != tenant_id:
raise HTTPException(status_code=403, detail="Access denied")
# Check timeout
if (
datetime.utcnow() - session["last_activity"]
).total_seconds() > settings.session_timeout:
raise HTTPException(status_code=408, detail="Session expired")
return session
async def _login_hmrc(page: Page, credentials: dict[str, str]) -> bool:
"""Login to HMRC portal"""
try:
# Navigate to HMRC login
await page.goto(settings.hmrc_base_url)
# Wait for login form
await page.wait_for_selector('input[name="userId"]', timeout=settings.timeout)
# Fill credentials
await page.fill('input[name="userId"]', credentials.get("user_id", ""))
await page.fill('input[name="password"]', credentials.get("password", ""))
# Submit form
await page.click('button[type="submit"]')
# Wait for redirect or error
await page.wait_for_load_state("networkidle")
        # Heuristic success check: HMRC redirects away from the sign-in page on success
        current_url = page.url
        return "sign-in" not in current_url.lower()
except Exception as e:
logger.error("HMRC login failed", error=str(e))
return False
async def _login_open_banking(page: Page, credentials: dict[str, str]) -> bool:
"""Login to Open Banking portal"""
try:
# This would implement Open Banking login flow
# For now, return False as it's not implemented
logger.warning("Open Banking login not implemented")
return False
except Exception as e:
logger.error("Open Banking login failed", error=str(e))
return False
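# One possible shape for the missing flow, kept separate so the stub above stays a stub.
# Everything here is an assumption: settings.open_banking_base_url is a hypothetical
# setting, and the selectors are placeholders that would have to match the real
# provider's login and consent pages.
async def _login_open_banking_sketch(page: Page, credentials: dict[str, str]) -> bool:
    try:
        await page.goto(settings.open_banking_base_url)  # hypothetical setting
        await page.wait_for_selector('input[name="username"]', timeout=settings.timeout)
        await page.fill('input[name="username"]', credentials.get("user_id", ""))
        await page.fill('input[name="password"]', credentials.get("password", ""))
        await page.click('button[type="submit"]')
        # Open Banking journeys normally bounce through a consent screen before
        # redirecting back to the caller; wait for the navigation chain to settle.
        await page.wait_for_load_state("networkidle")
        return "login" not in page.url.lower()
    except Exception as e:
        logger.error("Open Banking login sketch failed", error=str(e))
        return False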
async def _extract_hmrc_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
"""Extract data from HMRC portal"""
try:
data_type = config.get("data_type", "tax_returns")
tax_year = config.get("tax_year", "2023-24")
extracted_data = {
"data_type": data_type,
"tax_year": tax_year,
"records": [],
"extracted_at": datetime.utcnow().isoformat(),
}
if data_type == "tax_returns":
# Navigate to tax returns section
await page.click('a[href*="tax-return"]')
await page.wait_for_load_state("networkidle")
# Extract return data
returns = await page.query_selector_all(".tax-return-item")
for return_element in returns:
return_data = await return_element.evaluate(
"""
element => ({
year: element.querySelector('.tax-year')?.textContent?.trim(),
status: element.querySelector('.status')?.textContent?.trim(),
amount: element.querySelector('.amount')?.textContent?.trim()
})
"""
)
extracted_data["records"].append(return_data)
return extracted_data
except Exception as e:
logger.error("HMRC data extraction failed", error=str(e))
return {"error": str(e), "records": []}
async def _extract_banking_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
"""Extract banking data via Open Banking"""
try:
# This would implement Open Banking data extraction
logger.warning("Open Banking extraction not implemented")
return {"error": "Not implemented", "records": []}
except Exception as e:
logger.error("Banking data extraction failed", error=str(e))
return {"error": str(e), "records": []}
async def _cleanup_session_after_timeout(session_id: str, timeout_seconds: int) -> None:
"""Cleanup session after timeout"""
await asyncio.sleep(timeout_seconds)
if session_id in active_sessions:
try:
session = active_sessions[session_id]
await session["context"].close()
del active_sessions[session_id]
logger.info("Session cleaned up due to timeout", session_id=session_id)
except Exception as e:
logger.error(
"Failed to cleanup session", session_id=session_id, error=str(e)
)
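# How the timeout task is expected to be wired up: the session-creation endpoint
# (earlier in this file, not shown in this excerpt) would schedule the coroutine as a
# background task so the request is not blocked. A minimal sketch, assuming the session
# entry already exists in active_sessions:
#
#     cleanup_task = asyncio.create_task(
#         _cleanup_session_after_timeout(session_id, settings.session_timeout)
#     )
#     active_sessions[session_id]["cleanup_task"] = cleanup_task
#
# Keeping a reference to the task matters: the event loop only holds weak references
# to tasks, so an unreferenced cleanup task can be garbage-collected before it fires.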
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
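# For reference, the handler above serialises errors as an RFC 7807 "problem details"
# document. An expired session (the 408 raised in _get_session) would come back roughly
# as follows (the instance URL is illustrative):
#
#     {
#         "type": "https://httpstatuses.com/408",
#         "title": "Session expired",
#         "status": 408,
#         "detail": "Session expired",
#         "instance": "http://localhost:8001/sessions/<session_id>/extract",
#         "trace_id": ""
#     }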
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8001, reload=True, log_config=None)

View File

@@ -0,0 +1,17 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Browser automation
playwright>=1.40.0
# Additional async utilities
# asyncio-timeout>=4.0.3 # Deprecated, use asyncio.timeout from Python 3.11+ standard library
# Session management
# Note: aioredis 2.x is no longer maintained; redis>=4.2 provides the same API as redis.asyncio
aioredis>=2.0.1
# Browser management
psutil>=5.9.0