Initial commit
tests/e2e/test_coverage_to_compute_flow.py (new file, +472)
@@ -0,0 +1,472 @@
"""End-to-end test for coverage to compute flow integration."""

# FILE: tests/e2e/test_coverage_to_compute_flow.py

import os
import sys
from unittest.mock import AsyncMock, Mock, patch

import pytest
from fastapi.testclient import TestClient

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.append(
    os.path.join(os.path.dirname(__file__), "..", "..", "apps", "svc-coverage")
)

from libs.schemas import OverallStatus, Role, Status


def create_test_app():
    """Create a test FastAPI app without problematic startup events"""
    from fastapi import FastAPI

    from libs.config import BaseAppSettings
    from libs.security import TrustedProxyMiddleware

    # Create minimal settings
    class TestSettings(BaseAppSettings):
        service_name: str = "test-coverage"
        internal_cidrs: list[str] = ["127.0.0.1/32"]

    settings = TestSettings()

    # Create test app
    test_app = FastAPI(
        title="Test Coverage Service",
        description="Test coverage service",
        version="1.0.0",
    )

    # Add middleware
    test_app.add_middleware(
        TrustedProxyMiddleware, internal_cidrs=settings.internal_cidrs
    )

    # Import and add routes from main app
    from main import app as main_app

    test_app.router = main_app.router

    return test_app


class TestCoverageToComputeFlow:
    """Test end-to-end flow from coverage checking to compute triggering"""

    @pytest.fixture
    def client(self):
        """Create test client"""
        test_app = create_test_app()
        return TestClient(test_app)

    @pytest.fixture
    def mock_dependencies(self):
        """Mock all external dependencies"""
        with (
            patch("sys.modules"),
            patch("libs.policy.loader.PolicyLoader") as mock_loader,
            patch("libs.neo.client.Neo4jClient") as mock_kg,
            patch("libs.rag.retriever.RAGRetriever") as mock_rag,
            patch("sqlalchemy.orm.Session") as mock_db,
            patch("libs.config.create_neo4j_client") as mock_create_neo4j,
            patch("libs.config.create_event_bus") as mock_create_event_bus,
            patch("libs.policy.get_policy_loader") as mock_get_policy_loader,
        ):
            # Mock policy loader
            mock_policy = Mock()
            mock_policy.policy.version = "1.0"
            mock_policy.policy.jurisdiction = "UK"
            mock_policy.policy.tax_year = "2024-25"
            mock_loader.return_value.load_policy.return_value = mock_policy
            mock_loader.return_value.compile_predicates.return_value = mock_policy

            # Mock KG client
            mock_kg_client = Mock()
            mock_kg.return_value = mock_kg_client

            # Mock RAG client
            mock_rag_client = Mock()
            mock_rag.return_value = mock_rag_client

            # Mock database session
            mock_session = Mock()
            mock_db.return_value = mock_session

            # Mock the factory functions that are called during startup
            mock_create_neo4j.return_value = Mock()
            mock_create_event_bus.return_value = Mock()
            mock_get_policy_loader.return_value = mock_loader.return_value

            yield {
                "policy_loader": mock_loader.return_value,
                "kg_client": mock_kg_client,
                "rag_client": mock_rag_client,
                "db_session": mock_session,
                "policy": mock_policy,
            }

    def test_complete_coverage_flow(self, client, mock_dependencies):
        """Test complete flow when all evidence is present"""
        # Mock coverage evaluator to return complete coverage
        with patch("apps.svc_coverage.main.CoverageEvaluator") as mock_evaluator_class:
            mock_evaluator = Mock()
            mock_evaluator_class.return_value = mock_evaluator

            # Mock complete coverage report
            mock_report = Mock()
            mock_report.overall_status = OverallStatus.OK
            mock_report.schedules_required = ["SA102"]
            mock_report.blocking_items = []

            # Mock coverage details
            mock_evidence = Mock()
            mock_evidence.id = "P60"
            mock_evidence.status = Status.PRESENT_VERIFIED
            mock_evidence.role = Role.REQUIRED
            mock_evidence.found = [
                Mock(
                    doc_id="DOC-P60-001",
                    kind="P60",
                    ocr_confidence=0.95,
                    extract_confidence=0.92,
                )
            ]

            mock_schedule = Mock()
            mock_schedule.schedule_id = "SA102"
            mock_schedule.status = OverallStatus.OK
            mock_schedule.evidence = [mock_evidence]

            mock_report.coverage = [mock_schedule]
            mock_evaluator.check_document_coverage.return_value = mock_report

            # Call coverage check endpoint
            response = client.post(
                "/v1/coverage/check",
                json={
                    "taxpayer_id": "T-001",
                    "tax_year": "2024-25",
                    "jurisdiction": "UK",
                },
            )

            assert response.status_code == 200
            data = response.json()

            # Verify response structure
            assert data["overall_status"] == "OK"
            assert len(data["schedules_required"]) == 1
            assert "SA102" in data["schedules_required"]
            assert len(data["coverage"]) == 1
            assert len(data["blocking_items"]) == 0

            # Verify coverage details
            sa102_coverage = data["coverage"][0]
            assert sa102_coverage["schedule_id"] == "SA102"
            assert sa102_coverage["status"] == "OK"
            assert len(sa102_coverage["evidence"]) == 1

            p60_evidence = sa102_coverage["evidence"][0]
            assert p60_evidence["id"] == "P60"
            assert p60_evidence["status"] == "PRESENT_VERIFIED"
            assert p60_evidence["role"] == "REQUIRED"

    def test_incomplete_coverage_flow(self, client, mock_dependencies):
        """Test flow when evidence is missing"""
        with patch("apps.svc_coverage.main.CoverageEvaluator") as mock_evaluator_class:
            mock_evaluator = Mock()
            mock_evaluator_class.return_value = mock_evaluator

            # Mock incomplete coverage report
            mock_report = Mock()
            mock_report.overall_status = OverallStatus.BLOCKING
            mock_report.schedules_required = ["SA102"]

            # Mock evidence
            mock_evidence = Mock()
            mock_evidence.id = "P60"
            mock_evidence.status = Status.MISSING
            mock_evidence.role = Role.REQUIRED
            mock_evidence.found = []
            mock_evidence.acceptable_alternatives = ["P45", "FinalPayslipYTD"]

            mock_schedule = Mock()
            mock_schedule.schedule_id = "SA102"
            mock_schedule.status = OverallStatus.BLOCKING
            mock_schedule.evidence = [mock_evidence]

            # Mock blocking item (without acceptable_alternatives field per schema)
            mock_blocking_item = Mock()
            mock_blocking_item.schedule_id = "SA102"
            mock_blocking_item.evidence_id = "P60"

            mock_report.coverage = [mock_schedule]
            mock_report.blocking_items = [mock_blocking_item]
            mock_evaluator.check_document_coverage.return_value = mock_report

            # Call coverage check endpoint
            response = client.post(
                "/v1/coverage/check",
                json={
                    "taxpayer_id": "T-001",
                    "tax_year": "2024-25",
                    "jurisdiction": "UK",
                },
            )

            assert response.status_code == 200
            data = response.json()

            # Verify incomplete status
            assert data["overall_status"] == "INCOMPLETE"
            assert len(data["blocking_items"]) == 1

            # Verify blocking item details
            blocking_item = data["blocking_items"][0]
            assert blocking_item["evidence_id"] == "P60"
            assert blocking_item["schedule_id"] == "SA102"

            # Verify alternatives are in the evidence item, not blocking item
            sa102_coverage = data["coverage"][0]
            p60_evidence = sa102_coverage["evidence"][0]
            assert len(p60_evidence["acceptable_alternatives"]) == 2

def test_clarification_flow(self, client, mock_dependencies):
|
||||
"""Test clarification question generation flow"""
|
||||
with patch("apps.svc_coverage.main.CoverageEvaluator") as mock_evaluator_class:
|
||||
mock_evaluator = AsyncMock()
|
||||
mock_evaluator_class.return_value = mock_evaluator
|
||||
|
||||
# Mock clarification response
|
||||
mock_evaluator.generate_clarifying_question.return_value = AsyncMock(
|
||||
question="To complete the SA102 for 2024-25, we need P60. These documents support boxes SA102_b1, SA102_b2.",
|
||||
why="P60 provides year-end pay and PAYE tax figures required for employment income reporting.",
|
||||
blocking=True,
|
||||
boxes_affected=["SA102_b1", "SA102_b2"],
|
||||
upload_options=[
|
||||
AsyncMock(
|
||||
label="Upload P60 (PDF/CSV)",
|
||||
accepted_formats=["pdf", "csv"],
|
||||
upload_endpoint="/v1/ingest/upload?tag=P60",
|
||||
),
|
||||
AsyncMock(
|
||||
label="Upload P45 (PDF/CSV)",
|
||||
accepted_formats=["pdf", "csv"],
|
||||
upload_endpoint="/v1/ingest/upload?tag=P45",
|
||||
),
|
||||
],
|
||||
citations=[
|
||||
AsyncMock(
|
||||
rule_id="UK.SA102.P60.Required",
|
||||
doc_id="SA102-Notes-2025",
|
||||
locator="p.3 §1.1",
|
||||
url="https://docs.local/SA102-Notes-2025#p3s1.1",
|
||||
)
|
||||
],
|
||||
)
|
||||
|
||||
# Call clarification endpoint
|
||||
response = client.post(
|
||||
"/v1/coverage/clarify",
|
||||
json={
|
||||
"taxpayer_id": "T-001",
|
||||
"tax_year": "2024-25",
|
||||
"jurisdiction": "UK",
|
||||
"schedule_id": "SA102",
|
||||
"evidence_id": "P60",
|
||||
},
|
||||
)
|
||||
|
||||
assert response.status_code == 200
|
||||
data = response.json()
|
||||
|
||||
# Verify clarification response
|
||||
assert "question" in data
|
||||
assert "why" in data
|
||||
assert data["blocking"] is True
|
||||
assert len(data["boxes_affected"]) == 2
|
||||
assert len(data["upload_options"]) == 2
|
||||
assert len(data["citations"]) == 1
|
||||
|
||||
# Verify upload options
|
||||
upload_option = data["upload_options"][0]
|
||||
assert "Upload P60" in upload_option["label"]
|
||||
assert "pdf" in upload_option["accepted_formats"]
|
||||
assert "/v1/ingest/upload" in upload_option["upload_endpoint"]
|
||||
|
||||
    def test_policy_validation_flow(self, client, mock_dependencies):
        """Test policy validation endpoint"""
        # Mock policy validation
        mock_dependencies["policy_loader"].validate_policy.return_value = AsyncMock(
            ok=True,
            errors=[],
        )

        # Call validation endpoint
        response = client.post(
            "/v1/coverage/validate",
            json={
                "version": "1.0",
                "jurisdiction": "UK",
                "tax_year": "2024-25",
                "tax_year_boundary": {"start": "2024-04-06", "end": "2025-04-05"},
                "defaults": {"confidence_thresholds": {"ocr": 0.82, "extract": 0.85}},
                "document_kinds": ["P60"],
                "triggers": {
                    "SA102": {"any_of": ["exists(IncomeItem[type='Employment'])"]}
                },
                "schedules": {
                    "SA102": {
                        "evidence": [
                            {"id": "P60", "role": "REQUIRED", "boxes": ["SA102_b1"]}
                        ]
                    }
                },
                "status_classifier": {
                    "present_verified": {"min_ocr": 0.82},
                    "present_unverified": {"min_ocr": 0.60},
                    "conflicting": {"conflict_rules": []},
                    "missing": {"default": True},
                },
                "conflict_resolution": {"precedence": ["P60"]},
                "question_templates": {"default": {"text": "test", "why": "test"}},
            },
        )

        assert response.status_code == 200
        data = response.json()

        assert data["valid"] is True
        assert len(data["errors"]) == 0

    def test_policy_reload_flow(self, client, mock_dependencies):
        """Test policy hot reload flow"""
        # Mock admin user check
        with patch("apps.svc_coverage.main.check_admin_permission") as mock_admin:
            mock_admin.return_value = True

            # Call reload endpoint
            response = client.post(
                "/admin/coverage/reload",
                headers={"Authorization": "Bearer admin-token"},
            )

            assert response.status_code == 200
            data = response.json()

            assert data["reloaded"] is True
            assert "timestamp" in data
            assert "version" in data

    def test_policy_info_flow(self, client, mock_dependencies):
        """Test policy information endpoint"""
        # Call policy info endpoint
        response = client.get("/v1/coverage/policy")

        assert response.status_code == 200
        data = response.json()

        assert data["version"] == "1.0"
        assert data["jurisdiction"] == "UK"
        assert data["tax_year"] == "2024-25"

    def test_health_check_flow(self, client, mock_dependencies):
        """Test health check endpoint"""
        response = client.get("/health")

        assert response.status_code == 200
        data = response.json()

        assert data["status"] == "healthy"
        assert "timestamp" in data

    def test_error_handling_flow(self, client, mock_dependencies):
        """Test error handling in coverage flow"""
        with patch("apps.svc_coverage.main.CoverageEvaluator") as mock_evaluator_class:
            mock_evaluator = AsyncMock()
            mock_evaluator_class.return_value = mock_evaluator

            # Mock evaluator to raise exception
            mock_evaluator.check_document_coverage.side_effect = Exception(
                "KG connection failed"
            )

            # Call coverage check endpoint
            response = client.post(
                "/v1/coverage/check",
                json={
                    "taxpayer_id": "T-001",
                    "tax_year": "2024-25",
                    "jurisdiction": "UK",
                },
            )

            assert response.status_code == 500
            data = response.json()

            assert "error" in data["detail"]

    def test_invalid_request_flow(self, client, mock_dependencies):
        """Test validation of invalid requests"""
        # Missing required fields
        response = client.post(
            "/v1/coverage/check",
            json={
                "taxpayer_id": "T-001",
                # Missing tax_year and jurisdiction
            },
        )

        assert response.status_code == 422  # Validation error

    def test_unauthorized_admin_flow(self, client, mock_dependencies):
        """Test unauthorized access to admin endpoints"""
        with patch("apps.svc_coverage.main.check_admin_permission") as mock_admin:
            mock_admin.return_value = False

            response = client.post(
                "/admin/coverage/reload", headers={"Authorization": "Bearer user-token"}
            )

            assert response.status_code == 403

    def test_concurrent_requests_flow(self, client, mock_dependencies):
        """Test handling of concurrent requests"""
        with patch("apps.svc_coverage.main.CoverageEvaluator") as mock_evaluator_class:
            mock_evaluator = AsyncMock()
            mock_evaluator_class.return_value = mock_evaluator

            # Mock successful response
            mock_evaluator.check_document_coverage.return_value = AsyncMock(
                overall_status=OverallStatus.OK,
                schedules_required=[],
                coverage=[],
                blocking_items=[],
            )

            # Make multiple concurrent requests
            responses = []
            for i in range(5):
                response = client.post(
                    "/v1/coverage/check",
                    json={
                        "taxpayer_id": f"T-{i:03d}",
                        "tax_year": "2024-25",
                        "jurisdiction": "UK",
                    },
                )
                responses.append(response)

            # All should succeed
            for response in responses:
                assert response.status_code == 200
                data = response.json()
                assert data["overall_status"] == "OK"
tests/e2e/test_happy_path.py (new file, +555)
@@ -0,0 +1,555 @@
# ROLE

You are a **Senior Platform Engineer + Backend Lead** generating **production code** and **ops assets** for a microservice suite that powers an accounting Knowledge Graph + Vector RAG platform. Authentication/authorization are centralized at the **edge via Traefik + Authentik** (ForwardAuth). **Services are trust-bound** to Traefik and consume user/role claims via forwarded headers/JWT.

# MISSION

Produce fully working code for **all application services** (FastAPI + Python 3.12) with:

- Solid domain models, Pydantic v2 schemas, type hints, strict mypy, ruff lint.
- OpenTelemetry tracing, Prometheus metrics, structured logging.
- Vault-backed secrets, MinIO S3 client, Qdrant client, Neo4j driver, Postgres (SQLAlchemy), Redis.
- Eventing (Kafka or SQS/SNS behind an interface).
- Deterministic data contracts, end-to-end tests, Dockerfiles, Compose, CI for Gitea.
- Traefik labels + Authentik Outpost integration for every exposed route.
- Zero PII in vectors (Qdrant), evidence-based lineage in KG, and bitemporal writes.

# GLOBAL CONSTRAINTS (APPLY TO ALL SERVICES)

- **Language & Runtime:** Python **3.12**.
- **Frameworks:** FastAPI, Pydantic v2, SQLAlchemy 2, httpx, aiokafka or boto3 (pluggable), redis-py, opentelemetry-instrumentation-fastapi, prometheus-fastapi-instrumentator.
- **Config:** `pydantic-settings` with `.env` overlay. Provide `Settings` class per service.
- **Secrets:** HashiCorp **Vault** (AppRole/JWT). Use Vault Transit to **envelope-encrypt** sensitive fields before persistence (helpers provided in `libs/security.py`).
- **Auth:** No OIDC in services. Add `TrustedProxyMiddleware` (a minimal sketch follows this list):

  - Reject if request not from internal network (configurable CIDR).
  - Require headers set by Traefik+Authentik (`X-Authenticated-User`, `X-Authenticated-Email`, `X-Authenticated-Groups`, `Authorization: Bearer …`).
  - Parse groups → `roles` list on `request.state`.

- **Observability:**

  - OpenTelemetry (traceparent propagation), span attrs (service, route, user, tenant).
  - Prometheus metrics endpoint `/metrics` protected by internal network check.
  - Structured JSON logs (timestamp, level, svc, trace_id, msg) via `structlog`.

- **Errors:** Global exception handler → RFC 7807 Problem+JSON (`type`, `title`, `status`, `detail`, `instance`, `trace_id`).
- **Testing:** `pytest`, `pytest-asyncio`, `hypothesis` (property tests for calculators), `coverage ≥ 90%` per service.
- **Static:** `ruff`, `mypy --strict`, `bandit`, `safety`, `licensecheck`.
- **Perf:** Each service exposes `/healthz`, `/readyz`, `/livez`; cold start < 500ms; p95 endpoint < 250ms (local).
- **Containers:** Distroless or slim images; non-root user; read-only FS; `/tmp` mounted for OCR where needed.
- **Docs:** OpenAPI JSON + ReDoc; MkDocs site with service READMEs.
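
A minimal sketch of the `TrustedProxyMiddleware` contract above, assuming Starlette's `BaseHTTPMiddleware`; the header keys match the trust-header section below, while the exempt-path set, status codes, and response bodies are illustrative:

```python
# Sketch only: reject untrusted networks, require edge-auth headers, expose roles.
import ipaddress

from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import JSONResponse

REQUIRED_HEADERS = ("X-Authenticated-User", "X-Authenticated-Email", "X-Authenticated-Groups")
EXEMPT_PATHS = {"/healthz", "/readyz", "/livez", "/metrics"}


class TrustedProxyMiddleware(BaseHTTPMiddleware):
    def __init__(self, app, internal_cidrs: list[str]):
        super().__init__(app)
        self._networks = [ipaddress.ip_network(cidr) for cidr in internal_cidrs]

    async def dispatch(self, request: Request, call_next):
        client_ip = ipaddress.ip_address(request.client.host)
        if not any(client_ip in net for net in self._networks):
            return JSONResponse({"detail": "untrusted network"}, status_code=403)
        if request.url.path not in EXEMPT_PATHS:
            missing = [h for h in REQUIRED_HEADERS if h not in request.headers]
            if missing or "Authorization" not in request.headers:
                return JSONResponse({"detail": "missing trust headers"}, status_code=401)
            # Parse Authentik groups into a roles list for downstream handlers.
            groups = request.headers["X-Authenticated-Groups"]
            request.state.roles = [g.strip() for g in groups.split(",") if g.strip()]
        return await call_next(request)
```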

# SHARED LIBS (GENERATE ONCE, REUSE)

Create `libs/` used by all services:

- `libs/config.py` – base `Settings`, env parsing, Vault client factory, MinIO client factory, Qdrant client factory, Neo4j driver factory, Redis factory, Kafka/SQS client factory.
- `libs/security.py` – Vault Transit helpers (`encrypt_field`, `decrypt_field`), header parsing, internal-CIDR validator.
- `libs/observability.py` – otel init, prometheus instrumentor, logging config.
- `libs/events.py` – abstract `EventBus` with `publish(topic, payload: dict)`, `subscribe(topic, handler)`. Two impls: Kafka (`aiokafka`) and SQS/SNS (`boto3`).
- `libs/schemas.py` – **canonical Pydantic models** shared across services (Document, Evidence, IncomeItem, etc.) mirroring the ontology schemas. Include JSONSchema exports.
- `libs/storage.py` – S3/MinIO helpers (bucket ensure, put/get, presigned).
- `libs/neo.py` – Neo4j session helpers, Cypher runner with retry, SHACL validator invoker (pySHACL on exported RDF).
- `libs/rag.py` – Qdrant collections CRUD, hybrid search (dense+sparse), rerank wrapper, de-identification utilities (regex + NER; hash placeholders).
- `libs/forms.py` – PDF AcroForm fill via `pdfrw` with overlay fallback via `reportlab`.
- `libs/calibration.py` – `calibrated_confidence(raw_score, method="temperature_scaling", params=...)`.
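
One possible shape for `libs/calibration.py`, assuming temperature scaling over a raw logit-like score; the parameter names and defaults are illustrative:

```python
# Sketch of libs/calibration.py: temperature scaling of a raw score into [0, 1].
import math


def calibrated_confidence(
    raw_score: float,
    method: str = "temperature_scaling",
    params: dict | None = None,
) -> float:
    """Map a raw model score to a calibrated probability."""
    if method != "temperature_scaling":
        raise ValueError(f"unknown calibration method: {method}")
    temperature = (params or {}).get("temperature", 1.0)  # T > 1 softens confidence
    # Treat a probability-like score as a logit before rescaling by T.
    logit = math.log(raw_score / (1.0 - raw_score)) if 0.0 < raw_score < 1.0 else raw_score
    return 1.0 / (1.0 + math.exp(-logit / temperature))
```

The temperature would be fitted offline on a held-out validation set and shipped as a per-model parameter.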

# EVENT TOPICS (STANDARDIZE)

- `doc.ingested`, `doc.ocr_ready`, `doc.extracted`, `kg.upserted`, `rag.indexed`, `calc.schedule_ready`, `form.filled`, `hmrc.submitted`, `review.requested`, `review.completed`, `firm.sync.completed`

Each payload MUST include: `event_id (ulid)`, `occurred_at (iso)`, `actor`, `tenant_id`, `trace_id`, `schema_version`, and a `data` object (service-specific).
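
A sketch of that envelope as a Pydantic v2 model; the ULID import assumes the `python-ulid` package, and the class name is illustrative:

```python
# Sketch of the shared event envelope; field set per the contract above.
from datetime import datetime, timezone

from pydantic import BaseModel, Field
from ulid import ULID  # assumption: the python-ulid package


class EventEnvelope(BaseModel):
    event_id: str = Field(default_factory=lambda: str(ULID()))
    occurred_at: str = Field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    actor: str
    tenant_id: str
    trace_id: str
    schema_version: str = "1.0"
    data: dict  # service-specific payload


envelope = EventEnvelope(
    actor="svc-ingestion", tenant_id="t_123", trace_id="abc-123",
    data={"doc_id": "d_abc"},
)
```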

# TRUST HEADERS FROM TRAEFIK + AUTHENTIK (USE EXACT KEYS)

- `X-Authenticated-User` (string)
- `X-Authenticated-Email` (string)
- `X-Authenticated-Groups` (comma-separated)
- `Authorization` (`Bearer <jwt>` from Authentik)
Reject any request missing these (except `/healthz|/readyz|/livez|/metrics` from internal CIDR).

---

## SERVICES TO IMPLEMENT (CODE FOR EACH)

### 1) `svc-ingestion`

**Purpose:** Accept uploads or URLs, checksum, store to MinIO, emit `doc.ingested`.

**Endpoints:**

- `POST /v1/ingest/upload` (multipart file, metadata: `tenant_id`, `kind`, `source`) → `{doc_id, s3_url, checksum}`
- `POST /v1/ingest/url` (json: `{url, kind, tenant_id}`) → downloads to MinIO
- `GET /v1/docs/{doc_id}` → metadata

**Logic:**

- Compute SHA256, dedupe by checksum; MinIO path `tenants/{tenant_id}/raw/{doc_id}.pdf` (sketched below).
- Store metadata in Postgres table `ingest_documents` (alembic migrations).
- Publish `doc.ingested` with `{doc_id, bucket, key, pages?, mime}`.
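
A sketch of that write path; `seen_checksums` stands in for the Postgres dedupe lookup and the storage call is elided:

```python
# Sketch of the ingestion write path: checksum, dedupe, deterministic MinIO key.
import hashlib


def ingest(raw: bytes, tenant_id: str, doc_id: str, seen_checksums: set[str]) -> dict | None:
    checksum = "sha256:" + hashlib.sha256(raw).hexdigest()
    if checksum in seen_checksums:  # dedupe by content hash, not filename
        return None
    seen_checksums.add(checksum)
    key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
    # storage.put(bucket=S3_BUCKET_RAW, key=key, body=raw) would go here
    return {"doc_id": doc_id, "key": key, "checksum": checksum}
```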

**Env:** `S3_BUCKET_RAW`, `MINIO_*`, `DB_URL`.

**Traefik labels:** route `/ingest/*`.

---

### 2) `svc-rpa`

**Purpose:** Scheduled RPA pulls from firm/client portals via Playwright.

**Tasks:**

- Playwright login flows (credentials from Vault), 2FA via Authentik OAuth device or OTP secret in Vault.
- Download statements/invoices; hand off to `svc-ingestion` via internal POST.
- Prefect flows: `pull_portal_X()`, `pull_portal_Y()` with schedules.

**Endpoints:**

- `POST /v1/rpa/run/{connector}` (manual trigger)
- `GET /v1/rpa/status/{run_id}`

**Env:** `VAULT_ADDR`, `VAULT_ROLE_ID`, `VAULT_SECRET_ID`.

---

### 3) `svc-ocr`

**Purpose:** OCR & layout extraction.

**Pipeline:**

- Pull object from MinIO, detect rotation/de-skew (`opencv-python`), split pages (`pymupdf`), OCR (`pytesseract`) or bypass if text layer present (`pdfplumber`).
- Output per-page text + **bbox** for lines/words.
- Write JSON to MinIO `tenants/{tenant_id}/ocr/{doc_id}.json` and emit `doc.ocr_ready`.
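
A sketch of the per-page text-layer-vs-OCR decision, assuming `pdfplumber` and `pytesseract`; the length heuristic is illustrative, and bbox extraction (e.g. via `pytesseract.image_to_data`) is omitted:

```python
# Sketch: bypass OCR when a usable text layer exists, else rasterize and OCR.
import pdfplumber
import pytesseract


def page_texts(pdf_path: str) -> list[str]:
    texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text() or ""
            if len(text.strip()) < 20:  # heuristic: no usable text layer -> OCR
                image = page.to_image(resolution=300).original  # PIL image
                text = pytesseract.image_to_string(image)
            texts.append(text)
    return texts
```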

**Endpoints:**

- `POST /v1/ocr/{doc_id}` (idempotent trigger)
- `GET /v1/ocr/{doc_id}` (fetch OCR JSON)

**Env:** `TESSERACT_LANGS`, `S3_BUCKET_EVIDENCE`.

---

### 4) `svc-extract`

**Purpose:** Classify docs and extract KV + tables into **schema-constrained JSON** (with bbox/page).

**Endpoints:**

- `POST /v1/extract/{doc_id}` body: `{strategy: "llm|rules|hybrid"}`
- `GET /v1/extract/{doc_id}` → structured JSON

**Implementation:**

- Use prompt files in `prompts/`: `doc_classify.txt`, `kv_extract.txt`, `table_extract.txt`.
- **Validator loop**: run LLM → validate JSONSchema → retry with error messages up to N times (sketched below).
- Return Pydantic models from `libs/schemas.py`.
- Emit `doc.extracted`.
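
A sketch of the validator loop, assuming the `jsonschema` package; `llm` is any string-to-string completion callable, and the feedback format is illustrative:

```python
# Sketch of the extract validator loop: LLM -> JSONSchema -> retry with errors.
import json

from jsonschema import Draft202012Validator


def extract_with_retries(llm, prompt: str, schema: dict, max_attempts: int = 3) -> dict:
    validator = Draft202012Validator(schema)
    feedback = ""
    for _ in range(max_attempts):
        raw = llm(prompt + feedback)
        try:
            candidate = json.loads(raw)
        except json.JSONDecodeError as exc:
            feedback = f"\n\nPrevious output was not valid JSON: {exc}"
            continue
        errors = sorted(validator.iter_errors(candidate), key=lambda e: list(e.path))
        if not errors:
            return candidate
        # Feed the violations back into the next attempt's prompt.
        feedback = "\n\nFix these schema violations:\n" + "\n".join(
            f"- {'/'.join(map(str, e.path))}: {e.message}" for e in errors
        )
    raise ValueError("extraction failed schema validation after retries")
```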

**Env:** `LLM_ENGINE`, `TEMPERATURE`, `MAX_TOKENS`.

---

### 5) `svc-normalize-map`

**Purpose:** Normalize & map extracted data to KG.

**Logic:**

- Currency normalization (ECB or static fx table), dates, UK tax year/basis period inference (sketched below).
- Entity resolution (blocking + fuzzy).
- Generate nodes/edges (+ `Evidence` with doc_id/page/bbox/text_hash).
- Use `libs/neo.py` to write with **bitemporal** fields; run **SHACL** validator; on violation, queue `review.requested`.
- Emit `kg.upserted`.
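
The tax-year inference is a pure function of the 6 April boundary; a minimal sketch:

```python
# Sketch of UK tax year inference used during normalization (6 April boundary).
from datetime import date


def uk_tax_year(d: date) -> str:
    start_year = d.year if d >= date(d.year, 4, 6) else d.year - 1
    return f"{start_year}-{str(start_year + 1)[-2:]}"


assert uk_tax_year(date(2024, 4, 5)) == "2023-24"
assert uk_tax_year(date(2024, 4, 6)) == "2024-25"
```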

**Endpoints:**

- `POST /v1/map/{doc_id}`
- `GET /v1/map/{doc_id}/preview` (diff view, to be used by UI)

**Env:** `NEO4J_*`.

---

### 6) `svc-kg`

**Purpose:** Graph façade + RDF/SHACL utility.

**Endpoints:**

- `GET /v1/kg/nodes/{label}/{id}`
- `POST /v1/kg/cypher` (admin-gated inline query; must check `admin` role)
- `POST /v1/kg/export/rdf` (returns RDF for SHACL)
- `POST /v1/kg/validate` (run pySHACL against `schemas/shapes.ttl`)
- `GET /v1/kg/lineage/{node_id}` (traverse `DERIVED_FROM` → Evidence)

**Env:** `NEO4J_*`.

---

### 7) `svc-rag-indexer`

**Purpose:** Build Qdrant indices (firm knowledge, legislation, best practices, glossary).

**Workflow:**

- Load sources (filesystem, URLs, Firm DMS via `svc-firm-connectors`).
- **De-identify PII** (regex + NER), replace with placeholders; store mapping only in Postgres (sketched below).
- Chunk (layout-aware) per `retrieval/chunking.yaml`.
- Compute **dense** embeddings (e.g., `bge-small-en-v1.5`) and **sparse** (Qdrant sparse).
- Upsert to Qdrant with payload `{jurisdiction, tax_years[], topic_tags[], version, pii_free: true, doc_id/section_id/url}`.
- Emit `rag.indexed`.
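
A sketch of the regex half of the de-identification pass (the NER pass would follow the same replace-with-placeholder shape); the patterns are rough illustrations, not production-grade:

```python
# Sketch: regex PII hits become stable salted-hash placeholders before indexing.
import hashlib
import re

PII_PATTERNS = {
    "NINO": re.compile(r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b"),  # UK NI number, roughly
    "EMAIL": re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.]+\b"),
}


def deidentify(text: str, salt: str) -> tuple[str, dict[str, str]]:
    mapping: dict[str, str] = {}  # placeholder -> original; persisted only in Postgres
    for kind, pattern in PII_PATTERNS.items():
        for match in set(pattern.findall(text)):
            digest = hashlib.sha256((salt + match).encode()).hexdigest()[:12]
            placeholder = f"[{kind}_{digest}]"
            mapping[placeholder] = match
            text = text.replace(match, placeholder)
    return text, mapping
```

Hashing with a per-tenant salt keeps placeholders stable across re-indexing runs without leaking the raw value into Qdrant.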

**Endpoints:**

- `POST /v1/index/run`
- `GET /v1/index/status/{run_id}`

**Env:** `QDRANT_URL`, `RAG_EMBEDDING_MODEL`, `RAG_RERANKER_MODEL`.

---

### 8) `svc-rag-retriever`

**Purpose:** Hybrid search + KG fusion with rerank and calibrated confidence.

**Endpoint:**

- `POST /v1/rag/search` `{query, tax_year?, jurisdiction?, k?}` →

```
{
  "chunks": [...],
  "citations": [{doc_id|url, section_id?, page?, bbox?}],
  "kg_hints": [{rule_id, formula_id, node_ids[]}],
  "calibrated_confidence": 0.0-1.0
}
```

**Implementation:**

- Hybrid score: `alpha * dense + beta * sparse`; rerank top-K via cross-encoder; **KG fusion** (boost chunks citing Rules/Calculations relevant to schedule). A scoring sketch follows.
- Use `libs/calibration.py` to expose calibrated confidence.
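
A sketch of the coarse hybrid scoring stage; the weights and the additive KG boost are illustrative choices, and the cross-encoder rerank is left as a comment:

```python
# Sketch of hybrid scoring with a KG-fusion boost.
from dataclasses import dataclass


@dataclass
class Candidate:
    chunk_id: str
    dense: float   # dense-vector similarity
    sparse: float  # sparse (BM25-style) score, normalized to [0, 1]
    kg_hit: bool   # chunk cites a Rule/Calculation relevant to the schedule


def hybrid_score(c: Candidate, alpha: float = 0.7, beta: float = 0.3,
                 kg_boost: float = 0.1) -> float:
    return alpha * c.dense + beta * c.sparse + (kg_boost if c.kg_hit else 0.0)


def top_k(candidates: list[Candidate], k: int) -> list[Candidate]:
    # A cross-encoder rerank of these top-K would follow this coarse sort.
    return sorted(candidates, key=hybrid_score, reverse=True)[:k]
```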

---

### 9) `svc-reason`

**Purpose:** Deterministic calculators + materializers (UK SA).

**Endpoints:**

- `POST /v1/reason/compute_schedule` `{tax_year, taxpayer_id, schedule_id}`
- `GET /v1/reason/explain/{schedule_id}` → rationale & lineage paths

**Implementation:**

- Pure functions for: employment, self-employment, property (FHL, 20% interest credit), dividends/interest, allowances, NIC (Class 2/4), HICBC, student loans (Plans 1/2/4/5, PGL).
- **Deterministic order** as defined; rounding per `FormBox.rounding_rule`.
- Use Cypher from `kg/reasoning/schedule_queries.cypher` to materialize box values; attach `DERIVED_FROM` evidence.
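
As an example of the "pure function" style, a sketch of the personal allowance taper (2024-25 figures assumed: allowance £12,570, reduced £1 for every £2 of adjusted net income over £100,000); `Decimal` keeps the arithmetic deterministic:

```python
# Sketch of one deterministic calculator: personal allowance taper.
from decimal import Decimal


def personal_allowance(
    adjusted_net_income: Decimal,
    basic_allowance: Decimal = Decimal("12570"),
    taper_threshold: Decimal = Decimal("100000"),
) -> Decimal:
    excess = max(adjusted_net_income - taper_threshold, Decimal("0"))
    return max(basic_allowance - excess / 2, Decimal("0"))


assert personal_allowance(Decimal("100000")) == Decimal("12570")
assert personal_allowance(Decimal("125140")) == Decimal("0")  # fully tapered
```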

---

### 10) `svc-forms`

**Purpose:** Fill PDFs and assemble evidence bundles.

**Endpoints:**

- `POST /v1/forms/fill` `{tax_year, taxpayer_id, form_id}` → returns PDF (binary)
- `POST /v1/forms/evidence_pack` `{scope}` → ZIP + manifest + signed hashes (sha256)

**Implementation:**

- `pdfrw` for AcroForm; overlay with ReportLab if needed.
- Manifest includes `doc_id/page/bbox/text_hash` for every numeric field.
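
A minimal sketch of the `pdfrw` AcroForm fill (the common widget-annotation recipe); the ReportLab overlay fallback for flattened forms is not shown:

```python
# Sketch: fill AcroForm text fields by widget name with pdfrw.
from pdfrw import PdfDict, PdfName, PdfObject, PdfReader, PdfWriter


def fill_acroform(src: str, dst: str, values: dict[str, str]) -> None:
    template = PdfReader(src)
    for page in template.pages:
        for annotation in page.Annots or []:
            if annotation.Subtype == PdfName.Widget and annotation.T:
                key = annotation.T.to_unicode()
                if key in values:
                    # Set the value and drop the stale appearance stream.
                    annotation.update(PdfDict(V=values[key], AP=None))
    if template.Root.AcroForm:
        # Ask viewers to regenerate field appearances.
        template.Root.AcroForm.update(PdfDict(NeedAppearances=PdfObject("true")))
    PdfWriter().write(dst, template)
```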

---

### 11) `svc-hmrc`

**Purpose:** HMRC submitter (stub|sandbox|live).

**Endpoints:**

- `POST /v1/hmrc/submit` `{tax_year, taxpayer_id, dry_run}` → `{status, submission_id?, errors[]}`
- `GET /v1/hmrc/submissions/{id}`

**Implementation:**

- Rate limits, retries/backoff, signed audit log; environment toggle.
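
A sketch of the retry/backoff half, assuming `httpx`; the retryable status set, timeouts, and delays are illustrative:

```python
# Sketch: bounded retries with exponential backoff around the submit call.
import time

import httpx


def submit_with_retries(payload: dict, url: str, max_attempts: int = 5) -> httpx.Response:
    delay = 1.0
    for attempt in range(1, max_attempts + 1):
        try:
            response = httpx.post(url, json=payload, timeout=30.0)
            if response.status_code not in (429, 500, 502, 503, 504):
                return response  # success, or a non-retryable client error
        except httpx.TransportError:
            pass  # network blip: fall through and retry
        if attempt == max_attempts:
            break
        time.sleep(delay)
        delay *= 2  # exponential backoff: 1s, 2s, 4s, ...
    raise RuntimeError(f"HMRC submission failed after {max_attempts} attempts")
```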

---

### 12) `svc-firm-connectors`

**Purpose:** Read-only connectors to Firm Databases (Practice Mgmt, DMS).

**Endpoints:**

- `POST /v1/firm/sync` `{since?}` → `{objects_synced, errors[]}`
- `GET /v1/firm/objects` (paged)

**Implementation:**

- Data contracts in `config/firm_contracts/`; mappers → Secure Client Data Store (Postgres) with lineage columns (`source`, `source_id`, `synced_at`).

---

### 13) `ui-review` (outline only)

- Next.js (SSO handled by Traefik+Authentik), shows extracted fields + evidence snippets; POST overrides to `svc-extract`/`svc-normalize-map`.

---

## DATA CONTRACTS (ESSENTIAL EXAMPLES)

**Event: `doc.ingested`**

```json
{
  "event_id": "01J...ULID",
  "occurred_at": "2025-09-13T08:00:00Z",
  "actor": "svc-ingestion",
  "tenant_id": "t_123",
  "trace_id": "abc-123",
  "schema_version": "1.0",
  "data": {
    "doc_id": "d_abc",
    "bucket": "raw",
    "key": "tenants/t_123/raw/d_abc.pdf",
    "checksum": "sha256:...",
    "kind": "bank_statement",
    "mime": "application/pdf",
    "pages": 12
  }
}
```

**RAG search response shape**

```json
{
  "chunks": [
    {
      "id": "c1",
      "text": "...",
      "score": 0.78,
      "payload": {
        "jurisdiction": "UK",
        "tax_years": ["2024-25"],
        "topic_tags": ["FHL"],
        "pii_free": true
      }
    }
  ],
  "citations": [
    { "doc_id": "leg-ITA2007", "section_id": "s272A", "url": "https://..." }
  ],
  "kg_hints": [
    {
      "rule_id": "UK.FHL.Qual",
      "formula_id": "FHL_Test_v1",
      "node_ids": ["n123", "n456"]
    }
  ],
  "calibrated_confidence": 0.81
}
```

---

## PERSISTENCE SCHEMAS (POSTGRES; ALEMBIC)

- `ingest_documents(id pk, tenant_id, doc_id, kind, checksum, bucket, key, mime, pages, created_at)`
- `firm_objects(id pk, tenant_id, source, source_id, type, payload jsonb, synced_at)`
- Qdrant PII mapping table (if absolutely needed): `pii_links(id pk, placeholder_hash, client_id, created_at)` — **encrypt with Vault Transit** (sketched below); do NOT store raw values.
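
A sketch of the Transit envelope helpers named in `libs/security.py`, assuming the `hvac` client; Transit expects base64 plaintext and returns a `vault:v1:...` token that is safe to persist:

```python
# Sketch of Vault Transit field encryption via hvac.
import base64

import hvac


def encrypt_field(client: hvac.Client, key_name: str, value: str) -> str:
    plaintext = base64.b64encode(value.encode()).decode()  # Transit expects base64
    result = client.secrets.transit.encrypt_data(name=key_name, plaintext=plaintext)
    return result["data"]["ciphertext"]  # "vault:v1:..." token, safe to store


def decrypt_field(client: hvac.Client, key_name: str, ciphertext: str) -> str:
    result = client.secrets.transit.decrypt_data(name=key_name, ciphertext=ciphertext)
    return base64.b64decode(result["data"]["plaintext"]).decode()
```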

---

## TRAEFIK + AUTHENTIK (COMPOSE LABELS PER SERVICE)

For every service container in `infra/compose/docker-compose.local.yml`, add labels:

```
- "traefik.enable=true"
- "traefik.http.routers.svc-extract.rule=Host(`api.local`) && PathPrefix(`/extract`)"
- "traefik.http.routers.svc-extract.entrypoints=websecure"
- "traefik.http.routers.svc-extract.tls=true"
- "traefik.http.routers.svc-extract.middlewares=authentik-forwardauth,rate-limit"
- "traefik.http.services.svc-extract.loadbalancer.server.port=8000"
```

Use the shared dynamic file `traefik-dynamic.yml` with `authentik-forwardauth` and `rate-limit` middlewares.

---

## OUTPUT FORMAT (STRICT)

Implement a **multi-file codebase** as fenced blocks, EXACTLY in this order:

```txt
# FILE: libs/config.py
# factories for Vault/MinIO/Qdrant/Neo4j/Redis/EventBus, Settings base
...
```

```txt
# FILE: libs/security.py
# Vault Transit helpers, header parsing, internal CIDR checks, middleware
...
```

```txt
# FILE: libs/observability.py
# otel init, prometheus, structlog
...
```

```txt
# FILE: libs/events.py
# EventBus abstraction with Kafka and SQS/SNS impls
...
```

```txt
# FILE: libs/schemas.py
# Shared Pydantic models mirroring ontology entities
...
```

```txt
# FILE: apps/svc-ingestion/main.py
# FastAPI app, endpoints, MinIO write, Postgres, publish doc.ingested
...
```

```txt
# FILE: apps/svc-rpa/main.py
# Playwright flows, Prefect tasks, triggers
...
```

```txt
# FILE: apps/svc-ocr/main.py
# OCR pipeline, endpoints
...
```

```txt
# FILE: apps/svc-extract/main.py
# Classifier + extractors with validator loop
...
```

```txt
# FILE: apps/svc-normalize-map/main.py
# normalization, entity resolution, KG mapping, SHACL validation call
...
```

```txt
# FILE: apps/svc-kg/main.py
# KG façade, RDF export, SHACL validate, lineage traversal
...
```

```txt
# FILE: apps/svc-rag-indexer/main.py
# chunk/de-id/embed/upsert to Qdrant
...
```

```txt
# FILE: apps/svc-rag-retriever/main.py
# hybrid retrieval + rerank + KG fusion
...
```

```txt
# FILE: apps/svc-reason/main.py
# deterministic calculators, schedule compute/explain
...
```

```txt
# FILE: apps/svc-forms/main.py
# PDF fill + evidence pack
...
```

```txt
# FILE: apps/svc-hmrc/main.py
# submit stub|sandbox|live with audit + retries
...
```

```txt
# FILE: apps/svc-firm-connectors/main.py
# connectors to practice mgmt & DMS, sync to Postgres
...
```

```txt
# FILE: infra/compose/docker-compose.local.yml
# Traefik, Authentik, Vault, MinIO, Qdrant, Neo4j, Postgres, Redis, Prom+Grafana, Loki, Unleash, all services
...
```

```txt
# FILE: infra/compose/traefik.yml
# static Traefik config
...
```

```txt
# FILE: infra/compose/traefik-dynamic.yml
# forwardAuth middleware + routers/services
...
```

```txt
# FILE: .gitea/workflows/ci.yml
# lint->test->build->scan->push->deploy
...
```

```txt
# FILE: Makefile
# bootstrap, run, test, lint, build, deploy, format, seed
...
```

```txt
# FILE: tests/e2e/test_happy_path.py
# end-to-end: ingest -> ocr -> extract -> map -> compute -> fill -> (stub) submit
...
```

```txt
# FILE: tests/unit/test_calculators.py
# boundary tests for UK SA logic (NIC, HICBC, PA taper, FHL)
...
```

```txt
# FILE: README.md
# how to run locally with docker-compose, Authentik setup, Traefik certs
...
```

## DEFINITION OF DONE

- `docker compose up` brings the full stack up; SSO via Authentik; routes secured via Traefik ForwardAuth.
- Running `pytest` yields ≥ 90% coverage; `make e2e` passes the ingest→…→submit stub flow.
- All services expose `/healthz|/readyz|/livez|/metrics`; OpenAPI at `/docs`.
- No PII stored in Qdrant; vectors carry `pii_free=true`.
- KG writes are SHACL-validated; violations produce `review.requested` events.
- Evidence lineage is present for every numeric box value.
- Gitea pipeline passes: lint, test, build, scan, push, deploy.

# START

Generate the full codebase and configs in the **exact file blocks and order** specified above.