deployment, linting and infra configuration
Some checks failed: all CI/CD Pipeline jobs for this push (Code Quality & Linting, Policy Validation, Test Suite, Build Docker Images for each service, Security Scanning, Generate SBOM, Deploy to Staging, Deploy to Production, Notifications) were cancelled.

This commit is contained in:
harkon
2025-10-14 07:42:31 +01:00
parent f0f7674b8d
commit eea46ac89c
41 changed files with 1017 additions and 1448 deletions

View File

@@ -1,6 +1,6 @@
"""Database models for coverage service.""" """Database models for coverage service."""
# FILE: apps/svc-coverage/models.py # FILE: apps/svc_coverage/models.py
from datetime import datetime from datetime import datetime

View File

@@ -0,0 +1,13 @@
# Service-specific dependencies for svc_coverage
# Database migrations
alembic>=1.14.0
# OpenTelemetry (required by libs.observability)
opentelemetry-api>=1.37.0
opentelemetry-sdk>=1.37.0
opentelemetry-exporter-otlp-proto-grpc>=1.37.0
opentelemetry-instrumentation-fastapi>=0.42b0
opentelemetry-instrumentation-httpx>=0.42b0
opentelemetry-instrumentation-psycopg2>=0.42b0
opentelemetry-instrumentation-redis>=0.42b0

View File

@@ -1,17 +1,17 @@
 # Service-specific dependencies for svc_extract
 # LLM integration
-openai>=1.3.0
+openai>=2.3.0
-anthropic>=0.7.0
+anthropic>=0.69.0
 # JSON schema validation
-jsonschema>=4.20.0
+jsonschema>=4.25.1
 # Template processing
-jinja2>=3.1.0
+jinja2>=3.1.6
 # Text similarity (lightweight)
 fuzzywuzzy>=0.18.0
-python-Levenshtein>=0.23.0
+python-Levenshtein>=0.27.1
 # Data validation
-cerberus>=1.3.4
+cerberus>=1.3.7

View File

@@ -1,45 +1,45 @@
 # FastAPI and server
-fastapi>=0.104.1
+fastapi>=0.118.3
-uvicorn[standard]>=0.24.0
+uvicorn[standard]>=0.37.0
 pydantic>=2.5.0
 # Service-specific dependencies
 # Database connectors
-sqlalchemy>=2.0.0
+sqlalchemy>=2.0.44
-pymssql>=2.2.0
+pymssql>=2.3.7
 cx-Oracle>=8.3.0
 # API clients for practice management systems
-zeep>=4.2.0 # SOAP client
+zeep>=4.3.2 # SOAP client
-xmltodict>=0.13.0
+xmltodict>=1.0.2
 # OAuth for various systems
-authlib>=1.2.0
+authlib>=1.6.5
-requests-oauthlib>=1.3.0
+requests-oauthlib>=2.0.0
 # Data synchronization
-pandas>=2.1.0
+pandas>=2.3.3
 # Rate limiting
-ratelimit>=2.2.0
+ratelimit>=2.2.1
 # Retry mechanisms
-tenacity>=8.2.0
+tenacity>=9.1.2
 # CSV processing
-csvkit>=1.1.0
+csvkit>=2.1.0
 # Excel file processing
-openpyxl>=3.1.0
+openpyxl>=3.1.5
-xlrd>=2.0.0
+xlrd>=2.0.2
 # Data validation
-marshmallow>=3.20.0
+marshmallow>=4.0.1
-cerberus>=1.3.4
+cerberus>=1.3.7
 # Connection pooling (built into SQLAlchemy)
 # sqlalchemy-pool>=1.3.0 # Package doesn't exist, pooling is built into SQLAlchemy
 # Additional utilities
-python-dateutil>=2.8.0
+python-dateutil>=2.9.0
-pytz>=2023.3
+pytz>=2025.2

View File

@@ -1,37 +1,37 @@
 # FastAPI and server
-fastapi>=0.104.1
+fastapi>=0.118.3
-uvicorn[standard]>=0.24.0
+uvicorn[standard]>=0.37.0
-pydantic>=2.5.0
+pydantic>=2.12.0
 # Service-specific dependencies
 # PDF form filling
 pdfrw>=0.4
-reportlab>=4.0.0
+reportlab>=4.4.4
 # PDF processing
-PyPDF2>=3.0.0
+PyPDF2>=3.0.1
-pypdf>=3.17.0
+pypdf>=6.1.1
 # Image processing for overlays
-Pillow>=10.1.0
+Pillow>=11.3.0
 # ZIP file creation for evidence packs
 zipfile36>=0.1.3
 # Template processing
-jinja2>=3.1.0
+jinja2>=3.1.6
 # QR code generation
-qrcode>=7.4.0
+qrcode>=8.2
 # Barcode generation
-python-barcode>=0.15.0
+python-barcode>=0.16.1
 # Font handling
-fonttools>=4.44.0
+fonttools>=4.60.1
 # Additional PDF utilities
-pdfminer.six>=20231228
+pdfminer.six>=20250506
 # Document conversion
-python-docx>=1.1.0
+python-docx>=1.2.0

View File

@@ -1,40 +1,40 @@
 # FastAPI and server
-fastapi>=0.104.1
+fastapi>=0.118.3
-uvicorn[standard]>=0.24.0
+uvicorn[standard]>=0.37.0
-pydantic>=2.5.0
+pydantic>=2.12.0
 # Service-specific dependencies
 # OAuth and authentication
-authlib>=1.2.0
+authlib>=1.6.5
-oauthlib>=3.2.0
+oauthlib>=3.3.1
 # HTTP client with OAuth support
-requests-oauthlib>=1.3.0
+requests-oauthlib>=2.0.0
 # XML processing for HMRC APIs
-lxml>=4.9.0
+lxml>=6.0.2
-xmltodict>=0.13.0
+xmltodict>=1.0.2
 # JSON Web Tokens
-pyjwt>=2.8.0
+pyjwt>=2.10.1
 # UK government API utilities
-govuk-frontend-jinja>=2.8.0
+govuk-frontend-jinja>=3.8.0
 # Date and time for tax years
-python-dateutil>=2.8.0
+python-dateutil>=2.9.0
 # Retry mechanisms
-tenacity>=8.2.0
+tenacity>=9.1.2
 # Rate limiting
-ratelimit>=2.2.0
+ratelimit>=2.2.1
 # API validation
-marshmallow>=3.20.0
+marshmallow>=4.0.1
 # Encryption for sensitive data
-cryptography>=41.0.0
+cryptography>=46.0.2
 # Additional HTTP utilities
-urllib3>=2.1.0
+urllib3>=2.5.0

View File

@@ -1,22 +1,22 @@
 # Service-specific dependencies
 # RDF and semantic web
-rdflib>=7.0.0
+rdflib>=7.2.1
-pyshacl>=0.25.0
+pyshacl>=0.30.1
 # Graph algorithms
-networkx>=3.2.0
+networkx>=3.5
 # Data export formats
-xmltodict>=0.13.0
+xmltodict>=1.0.2
 # Query optimization
-pyparsing>=3.1.0
+pyparsing>=3.2.5
 # Graph visualization (optional)
-graphviz>=0.20.0
+graphviz>=0.21
 # Additional Neo4j utilities
-neomodel>=5.2.0
+neomodel>=5.5.3
 # Cypher query building
 py2neo>=2021.2.4

View File

@@ -1,37 +1,37 @@
 # FastAPI and server
-fastapi>=0.104.1
+fastapi>=0.118.3
-uvicorn[standard]>=0.24.0
+uvicorn[standard]>=0.37.0
-pydantic>=2.5.0
+pydantic>=2.12.0
 # Service-specific dependencies
 # Data normalization and cleaning
-pandas>=2.1.0
+pandas>=2.3.3
-numpy>=1.24.0
+numpy>=2.3.3
 # Currency and exchange rates
-forex-python>=1.8
+forex-python>=1.9.2
-babel>=2.13.0
+babel>=2.17.0
 # Date and time processing
-python-dateutil>=2.8.0
+python-dateutil>=2.9.0
-pytz>=2023.3
+pytz>=2025.2
 # Text normalization
-unidecode>=1.3.0
+unidecode>=1.4.0
-phonenumbers>=8.13.0
+phonenumbers>=9.0.16
 # Entity resolution and matching
 recordlinkage>=0.16.0
 fuzzywuzzy>=0.18.0
-python-Levenshtein>=0.23.0
+python-Levenshtein>=0.27.1
 # Geographic data
-geopy>=2.4.0
+geopy>=2.4.1
-pycountry>=23.12.0
+pycountry>=24.6.1
 # Data validation
-cerberus>=1.3.4
+cerberus>=1.3.7
-marshmallow>=3.20.0
+marshmallow>=4.0.1
 # UK-specific utilities
-uk-postcode-utils>=1.0.0
+uk-postcode-utils>=1.1

View File

@@ -1,17 +1,23 @@
# FILE: apps/svc-ocr/main.py # FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI # OCR and layout extraction using Tesseract, LayoutLM, and document AI
import asyncio
import io
import os import os
# Import shared libraries # Import shared libraries
import sys import sys
from datetime import datetime from datetime import datetime
from typing import Any from typing import Any, cast
import pytesseract
import structlog import structlog
import ulid import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -19,6 +25,7 @@ from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability from libs.observability import get_metrics, get_tracer, setup_observability
from libs.ocr.processor import OCRProcessor
from libs.schemas import ErrorResponse from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient from libs.storage import DocumentStorage, StorageClient
@@ -48,28 +55,31 @@ class OCRSettings(BaseAppSettings):
include_coordinates: bool = True include_coordinates: bool = True
include_confidence: bool = True include_confidence: bool = True
# Vision/LLM OCR configuration
vision_provider: str = "ollama" # or "openai"
vision_model: str = "llama3.2-vision:11b"
vision_format: str = (
"text" # text | markdown | json | table | key_value | structured
)
vision_preprocess: bool = True
openai_base_url: str = "https://api.openai.com/v1/chat/completions"
# Create app and settings
app, settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
# Global clients # Global clients
storage_client: StorageClient | None = None storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics() vision_processor: OCRProcessor | None = None
# Settings will be initialized after app creation
settings: OCRSettings
@app.on_event("startup") async def init_dependencies(app_settings: OCRSettings) -> None:
async def startup_event() -> None:
"""Initialize service dependencies""" """Initialize service dependencies"""
global storage_client, document_storage, event_bus global storage_client, document_storage, event_bus, settings, vision_processor
settings = app_settings
logger.info("Starting OCR service") logger.info("Starting OCR service")
# Setup observability # Setup observability
@@ -79,42 +89,44 @@ async def startup_event() -> None:
minio_client = create_minio_client(settings) minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client) storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client) document_storage = DocumentStorage(storage_client)
# Initialize event bus # Initialize event bus
event_bus = create_event_bus(settings) event_bus = create_event_bus(settings)
if not event_bus: if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized") raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start() eb = event_bus
# mypy: event_bus is Optional, so use local alias after check
await eb.start()
# Subscribe to document ingestion events # Subscribe to document ingestion events
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested) await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
# Initialize shared OCRProcessor for vision strategy
try:
vision_processor = OCRProcessor(
model_name=settings.vision_model,
provider=settings.vision_provider,
openai_base_url=settings.openai_base_url,
)
except Exception as e:
logger.error("Failed to initialize vision OCR processor", error=str(e))
logger.info("OCR service started successfully") logger.info("OCR service started successfully")
@app.on_event("shutdown") # Create app and settings
async def shutdown_event() -> None: app, _settings = create_app(
"""Cleanup service dependencies""" service_name="svc-ocr",
global event_bus title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
logger.info("Shutting down OCR service") # Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
if event_bus: tracer = get_tracer("svc-ocr")
await event_bus.stop() metrics = get_metrics()
logger.info("OCR service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/process/{doc_id}") @app.post("/process/{doc_id}")
@@ -132,9 +144,14 @@ async def process_document(
span.set_attribute("tenant_id", tenant_id) span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", strategy) span.set_attribute("strategy", strategy)
ds = document_storage
if ds is None:
raise HTTPException(
status_code=500, detail="Document storage not initialized"
)
try: try:
# Check if document exists # Check if document exists
doc_content = await document_storage.get_document(tenant_id, doc_id) doc_content = await ds.get_document(tenant_id, doc_id)
if not doc_content: if not doc_content:
raise HTTPException(status_code=404, detail="Document not found") raise HTTPException(status_code=404, detail="Document not found")
@@ -142,9 +159,9 @@ async def process_document(
processing_id = str(ulid.new()) processing_id = str(ulid.new())
span.set_attribute("processing_id", processing_id) span.set_attribute("processing_id", processing_id)
# Start background processing # Start background processing via sync wrapper (for mypy correctness)
background_tasks.add_task( background_tasks.add_task(
_process_document_async, _schedule_process_document_async,
doc_id, doc_id,
tenant_id, tenant_id,
doc_content, doc_content,
@@ -168,7 +185,9 @@ async def process_document(
raise raise
except Exception as e: except Exception as e:
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e)) logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start processing") raise HTTPException(
status_code=500, detail="Failed to start processing"
) from e
@app.get("/results/{doc_id}") @app.get("/results/{doc_id}")
@@ -183,9 +202,14 @@ async def get_ocr_results(
span.set_attribute("doc_id", doc_id) span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id) span.set_attribute("tenant_id", tenant_id)
ds = document_storage
if ds is None:
raise HTTPException(
status_code=500, detail="Document storage not initialized"
)
try: try:
# Get OCR results from storage # Get OCR results from storage
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id) ocr_results = await ds.get_ocr_result(tenant_id, doc_id)
if not ocr_results: if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found") raise HTTPException(status_code=404, detail="OCR results not found")
@@ -196,26 +220,32 @@ async def get_ocr_results(
raise raise
except Exception as e: except Exception as e:
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e)) logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get OCR results") raise HTTPException(
status_code=500, detail="Failed to get OCR results"
) from e
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None: async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
"""Handle document ingestion events""" """Handle document ingestion events"""
try: data = payload.data
data = payload.data doc_id = data.get("doc_id")
doc_id = data.get("doc_id") tenant_id = data.get("tenant_id")
tenant_id = data.get("tenant_id")
if not doc_id or not tenant_id: if not doc_id or not tenant_id:
logger.warning("Invalid document ingestion event", data=data) logger.warning("Invalid document ingestion event", data=data)
return return
ds = document_storage
if ds is None:
logger.error("Document storage not initialized")
return
# Auto-process PDF documents # Auto-process PDF documents
if data.get("content_type") == "application/pdf": if data.get("content_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id) logger.info("Auto-processing ingested document", doc_id=doc_id)
try:
# Get document content # Get document content
doc_content = await document_storage.get_document(tenant_id, doc_id) doc_content = await ds.get_document(tenant_id, doc_id)
if doc_content: if doc_content:
await _process_document_async( await _process_document_async(
doc_id=doc_id, doc_id=doc_id,
@@ -225,9 +255,10 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
processing_id=str(ulid.new()), processing_id=str(ulid.new()),
actor=payload.actor, actor=payload.actor,
) )
except Exception as e:
except Exception as e: logger.error(
logger.error("Failed to handle document ingestion", error=str(e)) "Failed to handle document ingestion", doc_id=doc_id, error=str(e)
)
async def _process_document_async( async def _process_document_async(
@@ -250,8 +281,8 @@ async def _process_document_async(
images = await _pdf_to_images(content) images = await _pdf_to_images(content)
# Process each page # Process each page
pages_data: list[Any] = [] pages_data: list[dict[str, Any]] = []
for page_num, image in enumerate(images, 1): for page_num, image in enumerate(images, 0):
page_data = await _process_page(image, page_num, strategy) page_data = await _process_page(image, page_num, strategy)
pages_data.append(page_data) pages_data.append(page_data)
@@ -270,7 +301,10 @@ async def _process_document_async(
} }
# Store results # Store results
await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results) ds = document_storage
if ds is None:
raise RuntimeError("Document storage not initialized")
await ds.store_ocr_result(tenant_id, doc_id, ocr_results)
# Update metrics # Update metrics
metrics.counter("documents_processed_total").labels( metrics.counter("documents_processed_total").labels(
@@ -282,7 +316,7 @@ async def _process_document_async(
).observe( ).observe(
datetime.utcnow().timestamp() datetime.utcnow().timestamp()
- datetime.fromisoformat( - datetime.fromisoformat(
ocr_results["processed_at"].replace("Z", "") ocr_results["processed_at"].replace("Z", "") # type: ignore
).timestamp() ).timestamp()
) )
@@ -300,7 +334,9 @@ async def _process_document_async(
tenant_id=tenant_id, tenant_id=tenant_id,
) )
await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload) eb = event_bus
if eb is not None:
await eb.publish(EventTopics.DOC_OCR_READY, event_payload)
logger.info( logger.info(
"OCR processing completed", doc_id=doc_id, pages=len(pages_data) "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
@@ -316,58 +352,91 @@ async def _process_document_async(
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]: async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
"""Convert PDF to images""" """Convert PDF to page images without PyMuPDF.
Primary: pdf2image (requires poppler). Fallback: extract largest embedded image per page via PyPDF2/Pillow.
"""
# First try pdf2image for full-page rasterization
try: try:
import fitz # PyMuPDF
# Open PDF
pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
images: list[Any] = []
for page_num in range(min(len(pdf_doc), settings.max_pages)):
page = pdf_doc[page_num]
# Render page to image
mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
images.append(img_data)
pdf_doc.close()
return images
except ImportError:
logger.error("PyMuPDF not available, using fallback")
return await _pdf_to_images_fallback(pdf_content)
except Exception as e:
logger.error("PDF conversion failed", error=str(e))
raise
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
"""Fallback PDF to images conversion"""
try:
from pdf2image import convert_from_bytes
images = convert_from_bytes( images = convert_from_bytes(
pdf_content, dpi=200, first_page=1, last_page=settings.max_pages pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
) )
image_bytes: list[bytes] = []
# Convert PIL images to bytes
image_bytes: list[Any] = []
for img in images: for img in images:
import io
img_buffer = io.BytesIO() img_buffer = io.BytesIO()
img.save(img_buffer, format="PNG") img.save(img_buffer, format="PNG")
image_bytes.append(img_buffer.getvalue()) image_bytes.append(img_buffer.getvalue())
return image_bytes return image_bytes
except Exception as e:
logger.warning(
"pdf2image conversion failed; falling back to PyPDF2", error=str(e)
)
except ImportError: # Fallback: extract largest embedded image per page using PyPDF2
logger.error("pdf2image not available") try:
raise Exception("No PDF conversion library available") reader = PdfReader(io.BytesIO(pdf_content))
out_images: list[bytes] = []
for page_index, page in enumerate(reader.pages):
if page_index >= settings.max_pages:
break
try:
resources = page.get("/Resources")
if resources is None:
continue
xobject = resources.get("/XObject")
if xobject is None:
continue
xobject = xobject.get_object()
largest = None
largest_area = -1
for _, obj_ref in xobject.items():
try:
obj = obj_ref.get_object()
if obj.get("/Subtype") != "/Image":
continue
width = int(obj.get("/Width", 0))
height = int(obj.get("/Height", 0))
area = width * height
if area > largest_area:
largest = obj
largest_area = area
except Exception:
continue
if largest is None:
continue
data = largest.get_data()
filt = largest.get("/Filter")
if filt in ("/DCTDecode", "/JPXDecode"):
# JPEG or JPEG2000
out_images.append(data)
else:
# Flate or other; decode via Pillow
mode = "RGB"
colorspace = largest.get("/ColorSpace")
if colorspace in ("/DeviceGray",):
mode = "L"
width = int(largest.get("/Width", 0))
height = int(largest.get("/Height", 0))
try:
img = Image.frombytes(mode, (width, height), data)
except Exception:
img = Image.open(io.BytesIO(data))
buf = io.BytesIO()
img.save(buf, format="PNG")
out_images.append(buf.getvalue())
except Exception:
continue
if not out_images:
raise RuntimeError("No images extracted via PyPDF2 fallback")
return out_images
except Exception as fallback_e:
logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
raise
async def _process_page( async def _process_page(
@@ -395,6 +464,8 @@ async def _process_page(
layoutlm_result.get("confidence", 0), layoutlm_result.get("confidence", 0),
), ),
} }
elif strategy == "vision":
return await _process_with_vision(image_data, page_num)
else: else:
raise ValueError(f"Unknown strategy: {strategy}") raise ValueError(f"Unknown strategy: {strategy}")
@@ -402,11 +473,6 @@ async def _process_page(
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]: async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with Tesseract OCR""" """Process page with Tesseract OCR"""
try: try:
import io
import pytesseract
from PIL import Image
# Load image # Load image
image = Image.open(io.BytesIO(image_data)) image = Image.open(io.BytesIO(image_data))
@@ -414,13 +480,13 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
config = f"{settings.tesseract_config} -l {settings.languages}" config = f"{settings.tesseract_config} -l {settings.languages}"
# Extract text with confidence # Extract text with confidence
data = pytesseract.image_to_data( data = pytesseract.image_to_data( # type: ignore
image, config=config, output_type=pytesseract.Output.DICT image, config=config, output_type=pytesseract.Output.DICT
) )
# Process results # Process results
words: list[Any] = [] words: list[dict[str, Any]] = []
confidences: list[Any] = [] confidences: list[float] = []
for i in range(len(data["text"])): for i in range(len(data["text"])):
if int(data["conf"][i]) > 0: # Valid confidence if int(data["conf"][i]) > 0: # Valid confidence
@@ -449,13 +515,6 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
"word_count": len(words), "word_count": len(words),
} }
except ImportError:
logger.error("pytesseract not available")
return {
"page": page_num,
"strategy": "tesseract",
"error": "pytesseract not available",
}
except Exception as e: except Exception as e:
logger.error("Tesseract processing failed", page=page_num, error=str(e)) logger.error("Tesseract processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "tesseract", "error": str(e)} return {"page": page_num, "strategy": "tesseract", "error": str(e)}
@@ -482,6 +541,68 @@ async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str,
return {"page": page_num, "strategy": "layoutlm", "error": str(e)} return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with LLM vision OCR via shared OCRProcessor"""
try:
vp = vision_processor
if vp is None:
raise RuntimeError("Vision OCR processor not initialized")
# Persist the page image temporarily for the processor API
import tempfile
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
tmp.write(image_data)
tmp_path = tmp.name
try:
text = vp.process_image(
image_path=tmp_path,
format_type=settings.vision_format,
preprocess=settings.vision_preprocess,
language=settings.languages,
)
finally:
try:
os.remove(tmp_path)
except OSError:
pass
return {
"page": page_num,
"strategy": "vision",
"text": text if isinstance(text, str) else str(text),
"confidence": 0.0, # Not provided by LLM API
}
except Exception as e:
logger.error("Vision processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "vision", "error": str(e)}
def _schedule_process_document_async(
doc_id: str,
tenant_id: str,
content: bytes,
strategy: str,
processing_id: str,
actor: str,
) -> None:
"""Sync wrapper to schedule the async OCR task.
This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict.
"""
asyncio.create_task(
_process_document_async(
doc_id=doc_id,
tenant_id=tenant_id,
content=content,
strategy=strategy,
processing_id=processing_id,
actor=actor,
)
)
@app.exception_handler(HTTPException) @app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse: async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format""" """Handle HTTP exceptions with RFC7807 format"""

View File

@@ -5,7 +5,7 @@
 pytesseract>=0.3.13
 # PDF processing
-PyMuPDF>=1.26.4
+PyPDF2>=3.0.1
 pdf2image>=1.17.0
 # Image processing

View File

@@ -2,7 +2,7 @@
 # NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
 # Text chunking (lightweight alternative to langchain)
-tiktoken>=0.11.0
+tiktoken>=0.12.0
 # Text preprocessing (lightweight)
 beautifulsoup4>=4.14.2

View File

@@ -1,20 +1,20 @@
 # FastAPI and server
-fastapi>=0.104.1
+fastapi>=0.118.3
-uvicorn[standard]>=0.24.0
+uvicorn[standard]>=0.37.0
-pydantic>=2.5.0
+pydantic>=2.12.0
 # Service-specific dependencies
 # Mathematical calculations
 # decimal is part of Python standard library
-sympy>=1.12.0
+sympy>=1.14.0
 # Tax calculations
 numpy>=2.3.3
-pandas>=2.1.0
+pandas>=2.3.3
 # Date and time calculations
-python-dateutil>=2.8.0
+python-dateutil>=2.9.0
-pytz>=2023.3
+pytz>=2025.2
 # UK tax specific
 # uk-tax-calculator>=1.0.0 # Package may not exist, commenting out
@@ -26,10 +26,10 @@ pytz>=2023.3
 # quantlib>=1.32.0 # Package may not exist, commenting out
 # Data validation
-cerberus>=1.3.4
+cerberus>=1.3.7
 # Template processing for explanations
-jinja2>=3.1.0
+jinja2>=3.1.6
 # Statistical calculations
-scipy>=1.11.0
+scipy>=1.16.2

View File

@@ -1,11 +1,11 @@
 # FastAPI and server
-fastapi>=0.104.1
+fastapi>=0.118.3
-uvicorn[standard]>=0.24.0
+uvicorn[standard]>=0.37.0
-pydantic>=2.5.0
+pydantic>=2.12.0
 # Service-specific dependencies
 # Browser automation
-playwright>=1.40.0
+playwright>=1.55.0
 # Additional async utilities
 # asyncio-timeout>=4.0.3 # Deprecated, use asyncio.timeout from Python 3.11+ standard library
@@ -14,4 +14,4 @@ playwright>=1.40.0
 aioredis>=2.0.1
 # Browser management
-psutil>=5.9.0
+psutil>=7.1.0

View File

@@ -7,6 +7,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
## Current State Analysis ## Current State Analysis
### Remote Server (`141.136.35.199`) ### Remote Server (`141.136.35.199`)
- **Location**: `/opt/compose/` - **Location**: `/opt/compose/`
- **Existing Services**: - **Existing Services**:
- Traefik v3.5.1 (reverse proxy with GoDaddy DNS challenge) - Traefik v3.5.1 (reverse proxy with GoDaddy DNS challenge)
@@ -25,6 +26,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
- `portainer.harkon.co.uk` - `portainer.harkon.co.uk`
### Local Repository (`infra/compose/`) ### Local Repository (`infra/compose/`)
- **Compose Files**: - **Compose Files**:
- `docker-compose.local.yml` - Full stack for local development - `docker-compose.local.yml` - Full stack for local development
- `docker-compose.backend.yml` - Backend services (appears to be production-ready) - `docker-compose.backend.yml` - Backend services (appears to be production-ready)
@@ -39,25 +41,30 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
## Challenges & Conflicts ## Challenges & Conflicts
### 1. **Duplicate Services** ### 1. **Duplicate Services**
- Both environments have Traefik and Authentik - Both environments have Traefik and Authentik
- Need to decide: shared vs. isolated - Need to decide: shared vs. isolated
### 2. **Network Naming** ### 2. **Network Naming**
- Remote: `frontend`, `backend` - Remote: `frontend`, `backend`
- Local: `ai-tax-agent-frontend`, `ai-tax-agent-backend` - Local: `ai-tax-agent-frontend`, `ai-tax-agent-backend`
- Production needs: Consistent naming - Production needs: Consistent naming
### 3. **Domain Management** ### 3. **Domain Management**
- Remote: `*.harkon.co.uk` (public) - Remote: `*.harkon.co.uk` (public)
- Local: `*.local.lan` (development) - Local: `*.local.lan` (development)
- Production: Need subdomains like `app.harkon.co.uk`, `api.harkon.co.uk` - Production: Need subdomains like `app.harkon.co.uk`, `api.harkon.co.uk`
### 4. **SSL Certificates** ### 4. **SSL Certificates**
- Remote: GoDaddy DNS challenge (production) - Remote: GoDaddy DNS challenge (production)
- Local: Self-signed certificates - Local: Self-signed certificates
- Production: Must use GoDaddy DNS challenge - Production: Must use GoDaddy DNS challenge
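As a rough sketch of how the GoDaddy DNS challenge is declared in Traefik's static configuration (resolver name, contact email, and storage path here are assumptions for illustration, not copied from the repo):

```yaml
certificatesResolvers:
  godaddy:
    acme:
      email: admin@harkon.co.uk            # assumed contact address
      storage: /certs/godaddy-acme.json    # assumed path inside the Traefik container
      dnsChallenge:
        provider: godaddy
        delayBeforeCheck: 30
# GODADDY_API_KEY / GODADDY_API_SECRET are read from the Traefik container's environment
```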
### 5. **Resource Isolation** ### 5. **Resource Isolation**
- Company services need to remain stable - Company services need to remain stable
- Application services need independent deployment/rollback - Application services need independent deployment/rollback
@@ -66,6 +73,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
We will deploy the company services and the AI Tax Agent as two fully isolated stacks, each with its own Traefik and Authentik. This maximizes blast-radius isolation and avoids naming and DNS conflicts across environments. We will deploy the company services and the AI Tax Agent as two fully isolated stacks, each with its own Traefik and Authentik. This maximizes blast-radius isolation and avoids naming and DNS conflicts across environments.
Key implications: Key implications:
- Separate external networks and DNS namespaces per stack - Separate external networks and DNS namespaces per stack
- Duplicate edge (Traefik) and IdP (Authentik), independent upgrades and rollbacks - Duplicate edge (Traefik) and IdP (Authentik), independent upgrades and rollbacks
- Slightly higher resource usage in exchange for strong isolation - Slightly higher resource usage in exchange for strong isolation
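To make the isolation model concrete, a minimal sketch (network names are illustrative, not the exact names used in the compose files):

```bash
# Each stack owns its own external Docker networks, created once on the host
docker network create company-frontend
docker network create ai-tax-agent-frontend
docker network create ai-tax-agent-backend

# Each stack's compose files reference only their own networks as `external: true`,
# so containers in one stack cannot reach the other stack's services directly.
```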
@@ -139,6 +147,7 @@ Key implications:
### Domain Mapping ### Domain Mapping
**Company Services** (existing): **Company Services** (existing):
- `traefik.harkon.co.uk` - Traefik dashboard - `traefik.harkon.co.uk` - Traefik dashboard
- `auth.harkon.co.uk` - Authentik SSO - `auth.harkon.co.uk` - Authentik SSO
- `gitea.harkon.co.uk` - Git hosting - `gitea.harkon.co.uk` - Git hosting
@@ -146,6 +155,7 @@ Key implications:
- `portainer.harkon.co.uk` - Docker management - `portainer.harkon.co.uk` - Docker management
**Application Services** (app stack): **Application Services** (app stack):
- `review.<domain>` - Review UI - `review.<domain>` - Review UI
- `api.<domain>` - API Gateway (microservices via Traefik) - `api.<domain>` - API Gateway (microservices via Traefik)
- `vault.<domain>` - Vault UI (admin only) - `vault.<domain>` - Vault UI (admin only)
@@ -159,12 +169,14 @@ Key implications:
### Authentication Strategy ### Authentication Strategy
**Authentik Configuration**: **Authentik Configuration**:
1. **Company Group** - Access to Gitea, Nextcloud, Portainer 1. **Company Group** - Access to Gitea, Nextcloud, Portainer
2. **App Admin Group** - Full access to all app services 2. **App Admin Group** - Full access to all app services
3. **App User Group** - Access to Review UI and API 3. **App User Group** - Access to Review UI and API
4. **App Reviewer Group** - Access to Review UI only 4. **App Reviewer Group** - Access to Review UI only
**Middleware Configuration**: **Middleware Configuration**:
- `authentik-forwardauth` - Standard auth for all services - `authentik-forwardauth` - Standard auth for all services
- `admin-auth` - Requires admin group (Vault, MinIO, Neo4j, etc.) - `admin-auth` - Requires admin group (Vault, MinIO, Neo4j, etc.)
- `reviewer-auth` - Requires reviewer or higher - `reviewer-auth` - Requires reviewer or higher
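A minimal sketch of the `authentik-forwardauth` middleware as Traefik dynamic configuration (the outpost address and response headers are assumptions based on a typical Authentik setup, not copied from the repo):

```yaml
http:
  middlewares:
    authentik-forwardauth:
      forwardAuth:
        address: "http://authentik-server:9000/outpost.goauthentik.io/auth/traefik"
        trustForwardHeader: true
        authResponseHeaders:
          - X-authentik-username
          - X-authentik-groups
          - X-authentik-email
```

The group-specific variants (`admin-auth`, `reviewer-auth`) would typically point at the same outpost, with the group restriction enforced by Authentik application policies rather than by Traefik itself.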
@@ -182,6 +194,7 @@ Key implications:
### Development Environment ### Development Environment
**Keep Existing Setup**: **Keep Existing Setup**:
- Use `docker-compose.local.yml` as-is - Use `docker-compose.local.yml` as-is
- Domain: `*.local.lan` - Domain: `*.local.lan`
- Self-signed certificates - Self-signed certificates
@@ -189,6 +202,7 @@ Key implications:
- Full stack runs locally - Full stack runs locally
**Benefits**: **Benefits**:
- No dependency on remote server - No dependency on remote server
- Fast iteration - Fast iteration
- Complete isolation - Complete isolation
@@ -217,19 +231,22 @@ make deploy-production # Deploy to remote server
### Phase 1: Preparation (Week 1) ### Phase 1: Preparation (Week 1)
1. **Backup Current State** 1. **Backup Current State**
```bash ```bash
ssh deploy@141.136.35.199 ssh deploy@141.136.35.199
cd /opt/compose cd /opt
tar -czf ~/backup-$(date +%Y%m%d).tar.gz . tar -czf ~/backup-$(date +%Y%m%d).tar.gz .
``` ```
2. **Create Production Environment File** 2. **Create Production Environment File**
- Copy `infra/compose/env.example` to `infra/compose/.env.production`
- Copy `infra/environments/production/.env.example` to `infra/environments/production/.env`
- Update all secrets and passwords - Update all secrets and passwords
- Set `DOMAIN=harkon.co.uk` - Set `DOMAIN=harkon.co.uk`
- Configure GoDaddy API credentials - Configure GoDaddy API credentials
3. **Update Traefik Configuration** 3. **Update Traefik Configuration**
- Merge local Traefik config with remote - Merge local Traefik config with remote
- Add application routes - Add application routes
- Configure Authentik ForwardAuth - Configure Authentik ForwardAuth
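For example, an application route added behind ForwardAuth could look like the following compose labels (hostname, certificate resolver, middleware name, and container port are illustrative assumptions):

```yaml
labels:
  - "traefik.enable=true"
  - "traefik.http.routers.ui-review.rule=Host(`review.harkon.co.uk`)"
  - "traefik.http.routers.ui-review.entrypoints=websecure"
  - "traefik.http.routers.ui-review.tls.certresolver=godaddy"
  - "traefik.http.routers.ui-review.middlewares=authentik-forwardauth@file"
  - "traefik.http.services.ui-review.loadbalancer.server.port=3000"  # assumed container port
```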
@@ -242,13 +259,15 @@ make deploy-production # Deploy to remote server
### Phase 2: Infrastructure Deployment (Week 2) ### Phase 2: Infrastructure Deployment (Week 2)
1. **Deploy Application Infrastructure** 1. **Deploy Application Infrastructure**
```bash ```bash
# On remote server # On remote server
cd /opt/compose/ai-tax-agent cd /opt/ai-tax-agent
docker compose -f infrastructure.yaml up -d docker compose -f infrastructure.yaml up -d
``` ```
2. **Initialize Services** 2. **Initialize Services**
- Vault: Unseal and configure - Vault: Unseal and configure
- Postgres: Run migrations - Postgres: Run migrations
- Neo4j: Install plugins - Neo4j: Install plugins
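Roughly, these initialization steps map to commands like the following (container names and the migration entrypoint are assumptions; adjust to the actual compose service names):

```bash
# Vault: initialise once, then unseal with the required number of key shares
docker exec -it vault vault operator init      # record unseal keys and root token securely
docker exec -it vault vault operator unseal    # repeat for each key share

# Postgres: run migrations from a service image that ships alembic (assumed entrypoint)
docker compose -f services.yaml run --rm svc-coverage alembic upgrade head

# Neo4j: confirm plugins (e.g. APOC) are present in the plugins directory
docker exec -it neo4j ls /plugins
```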
@@ -262,11 +281,13 @@ make deploy-production # Deploy to remote server
### Phase 3: Application Deployment (Week 3) ### Phase 3: Application Deployment (Week 3)
1. **Deploy Microservices** 1. **Deploy Microservices**
```bash ```bash
docker compose -f services.yaml up -d docker compose -f services.yaml up -d
``` ```
2. **Deploy Monitoring** 2. **Deploy Monitoring**
```bash ```bash
docker compose -f monitoring.yaml up -d docker compose -f monitoring.yaml up -d
``` ```

View File

@@ -10,7 +10,7 @@
### 1. Production Compose Files Created ### 1. Production Compose Files Created
Created three production-ready Docker Compose files in `infra/compose/production/`: Created three production-ready Docker Compose files in `infra/base/`:
#### **infrastructure.yaml** #### **infrastructure.yaml**
- Vault (secrets management) - Vault (secrets management)
@@ -104,7 +104,7 @@ chmod +x scripts/deploy-to-production.sh
### 3. Documentation Created ### 3. Documentation Created
#### **infra/compose/production/README.md** #### **infra/base manifests**
Comprehensive production deployment guide including: Comprehensive production deployment guide including:
- Prerequisites checklist - Prerequisites checklist
- Three deployment options (automated, step-by-step, manual) - Three deployment options (automated, step-by-step, manual)
@@ -221,7 +221,7 @@ Or step-by-step:
1. **Initialize Vault** 1. **Initialize Vault**
```bash ```bash
ssh deploy@141.136.35.199 ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent cd /opt/ai-tax-agent
docker exec -it vault vault operator init docker exec -it vault vault operator init
# Save unseal keys! # Save unseal keys!
docker exec -it vault vault operator unseal docker exec -it vault vault operator unseal
@@ -382,7 +382,6 @@ Deployment is successful when:
If you encounter issues: If you encounter issues:
1. Check logs: `./scripts/deploy-to-production.sh logs <service>` 1. Check logs: `./scripts/deploy-to-production.sh logs <service>`
2. Verify status: `./scripts/deploy-to-production.sh verify` 2. Verify status: `./scripts/deploy-to-production.sh verify`
3. Review documentation: `infra/compose/production/README.md` 3. Review manifests: `infra/base/*.yaml`
4. Check deployment plan: `docs/DEPLOYMENT_PLAN.md` 4. Check deployment plan: `docs/DEPLOYMENT_PLAN.md`
5. Follow checklist: `docs/DEPLOYMENT_CHECKLIST.md` 5. Follow checklist: `docs/DEPLOYMENT_CHECKLIST.md`

View File

@@ -21,15 +21,14 @@
- ✅ Created quick start guide (`docs/QUICK_START.md`) - ✅ Created quick start guide (`docs/QUICK_START.md`)
### 3. Production Configuration Files ### 3. Production Configuration Files
- ✅ Created `infra/compose/production/infrastructure.yaml` (7 infrastructure services) - ✅ Created `infra/base/infrastructure.yaml` (infrastructure, incl. Traefik + Authentik)
- ✅ Created `infra/compose/production/services.yaml` (14 application services + UI) - ✅ Created `infra/base/services.yaml` (application services + UI)
- ✅ Created `infra/compose/production/monitoring.yaml` (Prometheus, Grafana, Loki, Promtail) - ✅ Created `infra/base/monitoring.yaml` (Prometheus, Grafana, Loki, Promtail)
- ✅ Created `infra/compose/production/README.md` (deployment guide)
### 4. Monitoring Configuration ### 4. Monitoring Configuration
- ✅ Created Prometheus configuration (`infra/compose/prometheus/prometheus.yml`) - ✅ Created Prometheus configuration (`infra/base/prometheus/prometheus.yml`)
- ✅ Created Loki configuration (`infra/compose/loki/loki-config.yml`) - ✅ Created Loki configuration (`infra/base/loki/loki-config.yml`)
- ✅ Created Promtail configuration (`infra/compose/promtail/promtail-config.yml`) - ✅ Created Promtail configuration (`infra/base/promtail/promtail-config.yml`)
- ✅ Configured service discovery for all 14 services - ✅ Configured service discovery for all 14 services
- ✅ Set up 30-day metrics retention - ✅ Set up 30-day metrics retention
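As a sketch of what the per-service scrape configuration amounts to (job name, target, and port are illustrative; the real file is `infra/base/prometheus/prometheus.yml`):

```yaml
scrape_configs:
  - job_name: svc-ingestion
    metrics_path: /metrics
    static_configs:
      - targets: ["svc-ingestion:8000"]   # assumed container port
# 30-day retention is set on the Prometheus container, e.g. --storage.tsdb.retention.time=30d
```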
@@ -266,10 +265,9 @@ df -h
- `docs/ENVIRONMENT_COMPARISON.md` - Local vs Production comparison - `docs/ENVIRONMENT_COMPARISON.md` - Local vs Production comparison
2. **Configuration:** 2. **Configuration:**
- `infra/compose/production/README.md` - Production compose guide - `infra/base/infrastructure.yaml` - Infrastructure services
- `infra/compose/production/infrastructure.yaml` - Infrastructure services - `infra/base/services.yaml` - Application services
- `infra/compose/production/services.yaml` - Application services - `infra/base/monitoring.yaml` - Monitoring stack
- `infra/compose/production/monitoring.yaml` - Monitoring stack
3. **Deployment:** 3. **Deployment:**
- `docs/POST_BUILD_DEPLOYMENT.md` - Post-build deployment steps - `docs/POST_BUILD_DEPLOYMENT.md` - Post-build deployment steps
@@ -319,4 +317,3 @@ For questions or issues:
- 🟡 In Progress - 🟡 In Progress
- ⏳ Pending - ⏳ Pending
- ❌ Blocked - ❌ Blocked

View File

@@ -12,7 +12,7 @@ This document compares the local development environment with the production env
| **SSL** | Self-signed certificates | Let's Encrypt (GoDaddy DNS) | | **SSL** | Self-signed certificates | Let's Encrypt (GoDaddy DNS) |
| **Networks** | `ai-tax-agent-frontend`<br/>`ai-tax-agent-backend` | `frontend`<br/>`backend` | | **Networks** | `ai-tax-agent-frontend`<br/>`ai-tax-agent-backend` | `frontend`<br/>`backend` |
| **Compose File** | `docker-compose.local.yml` | `infrastructure.yaml`<br/>`services.yaml`<br/>`monitoring.yaml` | | **Compose File** | `docker-compose.local.yml` | `infrastructure.yaml`<br/>`services.yaml`<br/>`monitoring.yaml` |
| **Location** | Local machine | `deploy@141.136.35.199:/opt/compose/ai-tax-agent/` | | **Location** | Local machine | `deploy@141.136.35.199:/opt/ai-tax-agent/` |
| **Traefik** | Isolated instance | Shared with company services | | **Traefik** | Isolated instance | Shared with company services |
| **Authentik** | Isolated instance | Shared with company services | | **Authentik** | Isolated instance | Shared with company services |
| **Data Persistence** | Local Docker volumes | Remote Docker volumes + backups | | **Data Persistence** | Local Docker volumes | Remote Docker volumes + backups |
@@ -271,7 +271,7 @@ make clean
#### Production #### Production
```bash ```bash
# Deploy infrastructure # Deploy infrastructure
cd /opt/compose/ai-tax-agent cd /opt/ai-tax-agent
docker compose -f infrastructure.yaml up -d docker compose -f infrastructure.yaml up -d
# Deploy services # Deploy services
@@ -370,7 +370,7 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
4. **Deploy to production**: 4. **Deploy to production**:
```bash ```bash
ssh deploy@141.136.35.199 ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent cd /opt/ai-tax-agent
docker compose -f services.yaml pull docker compose -f services.yaml pull
docker compose -f services.yaml up -d docker compose -f services.yaml up -d
``` ```
@@ -436,4 +436,3 @@ The key differences between local and production environments are:
6. **Backups**: Local has none; production has automated backups 6. **Backups**: Local has none; production has automated backups
Both environments use the same application code and Docker images, ensuring consistency and reducing deployment risks. Both environments use the same application code and Docker images, ensuring consistency and reducing deployment risks.

View File

@@ -1,332 +0,0 @@
# Gitea Container Registry Debugging Guide
## Common Issues When Pushing Large Docker Images
### Issue 1: Not Logged In
**Symptom**: `unauthorized: authentication required`
**Solution**:
```bash
# On remote server
docker login gitea.harkon.co.uk
# Username: blue (or your Gitea username)
# Password: <your-gitea-access-token>
```
---
### Issue 2: Upload Size Limit (413 Request Entity Too Large)
**Symptom**: Push fails with `413 Request Entity Too Large` or similar error
**Root Cause**: Traefik or Gitea has a limit on request body size
**Solution A: Configure Traefik Middleware**
1. Find your Traefik configuration directory:
```bash
docker inspect traefik | grep -A 10 Mounts
```
2. Create middleware configuration:
```bash
# Example: /opt/traefik/config/middlewares.yml
sudo tee /opt/traefik/config/middlewares.yml > /dev/null << 'EOF'
http:
middlewares:
large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
EOF
```
3. Update Gitea container labels:
```yaml
labels:
- "traefik.http.routers.gitea.middlewares=large-upload@file"
```
4. Restart Traefik:
```bash
docker restart traefik
```
**Solution B: Configure Gitea Directly**
1. Edit Gitea configuration:
```bash
docker exec -it gitea-server vi /data/gitea/conf/app.ini
```
2. Add/modify these settings:
```ini
[server]
LFS_MAX_FILE_SIZE = 5368709120 ; 5GB
[repository.upload]
FILE_MAX_SIZE = 5368709120 ; 5GB
```
3. Restart Gitea:
```bash
docker restart gitea-server
```
---
### Issue 3: Network Timeout
**Symptom**: Push hangs or times out after uploading for a while
**Root Cause**: Network instability or slow connection
**Solution**: Use chunked uploads or increase timeout
1. Configure Docker daemon timeout:
```bash
# Edit /etc/docker/daemon.json
sudo tee /etc/docker/daemon.json > /dev/null << 'EOF'
{
"max-concurrent-uploads": 1,
"max-concurrent-downloads": 3,
"registry-mirrors": []
}
EOF
sudo systemctl restart docker
```
2. Or use Traefik timeout middleware:
```yaml
http:
middlewares:
long-timeout:
buffering:
retryExpression: "IsNetworkError() && Attempts() < 3"
```
---
### Issue 4: Disk Space
**Symptom**: Push fails with "no space left on device"
**Solution**:
```bash
# Check disk space
df -h
# Clean up Docker
docker system prune -a --volumes -f
# Check again
df -h
```
---
### Issue 5: Gitea Registry Not Enabled
**Symptom**: `404 Not Found` when accessing `/v2/`
**Solution**:
```bash
# Check if registry is enabled
docker exec gitea-server cat /data/gitea/conf/app.ini | grep -A 5 "\[packages\]"
# Should show:
# [packages]
# ENABLED = true
```
If not enabled, add to `app.ini`:
```ini
[packages]
ENABLED = true
```
Restart Gitea:
```bash
docker restart gitea-server
```
---
## Debugging Steps
### Step 1: Verify Gitea Registry is Accessible
```bash
# Should return 401 Unauthorized (which is good - means registry is working)
curl -I https://gitea.harkon.co.uk/v2/
# Should return 200 OK after login
docker login gitea.harkon.co.uk
curl -u "username:token" https://gitea.harkon.co.uk/v2/
```
### Step 2: Test with Small Image
```bash
# Pull a small image
docker pull alpine:latest
# Tag it for your registry
docker tag alpine:latest gitea.harkon.co.uk/harkon/test:latest
# Try to push
docker push gitea.harkon.co.uk/harkon/test:latest
```
If this works, the issue is with large images (size limit).
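If the small image pushes fine, it helps to check which layers of the large image are likely to exceed the body-size limit (the tag below is the test tag used later in this guide):

```bash
# Show per-layer sizes of the image that fails to push
docker history --no-trunc --format "table {{.Size}}\t{{.CreatedBy}}" gitea.harkon.co.uk/harkon/base-ml:test
```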
### Step 3: Check Gitea Logs
```bash
# Check for errors
docker logs gitea-server --tail 100 | grep -i error
# Watch logs in real-time while pushing
docker logs -f gitea-server
```
### Step 4: Check Traefik Logs
```bash
# Check for 413 or 502 errors
docker logs traefik --tail 100 | grep -E "413|502|error"
# Watch logs in real-time
docker logs -f traefik
```
### Step 5: Check Docker Daemon Logs
```bash
# Check Docker daemon logs
sudo journalctl -u docker --since "1 hour ago" | grep -i error
```
---
## Quick Fix: Bypass Traefik for Registry
If Traefik is causing issues, you can expose Gitea's registry directly:
1. Update Gitea docker-compose to expose port 3000:
```yaml
services:
gitea:
ports:
- "3000:3000" # HTTP
```
2. Use direct connection:
```bash
docker login gitea.harkon.co.uk:3000
docker push gitea.harkon.co.uk:3000/harkon/base-ml:v1.0.1
```
**Note**: This bypasses SSL, so only use for debugging!
---
## Recommended Configuration for Large Images
### Traefik Configuration
Create `/opt/traefik/config/gitea-registry.yml`:
```yaml
http:
middlewares:
gitea-registry:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB in memory
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB in memory
routers:
gitea-registry:
rule: "Host(`gitea.harkon.co.uk`) && PathPrefix(`/v2/`)"
entryPoints:
- websecure
middlewares:
- gitea-registry
service: gitea
tls:
certResolver: letsencrypt
```
### Gitea Configuration
In `/data/gitea/conf/app.ini`:
```ini
[server]
PROTOCOL = http
DOMAIN = gitea.harkon.co.uk
ROOT_URL = https://gitea.harkon.co.uk/
HTTP_PORT = 3000
LFS_MAX_FILE_SIZE = 5368709120
[repository.upload]
FILE_MAX_SIZE = 5368709120
ENABLED = true
[packages]
ENABLED = true
CHUNKED_UPLOAD_PATH = /data/gitea/tmp/package-upload
```
---
## Testing the Fix
After applying configuration changes:
1. Restart services:
```bash
docker restart traefik
docker restart gitea-server
```
2. Test with a large layer:
```bash
# Build base-ml (has large layers)
cd /home/deploy/ai-tax-agent
docker build -f infra/docker/base-ml.Dockerfile -t gitea.harkon.co.uk/harkon/base-ml:test .
# Try to push
docker push gitea.harkon.co.uk/harkon/base-ml:test
```
3. Monitor logs:
```bash
# Terminal 1: Watch Traefik
docker logs -f traefik
# Terminal 2: Watch Gitea
docker logs -f gitea-server
# Terminal 3: Push image
docker push gitea.harkon.co.uk/harkon/base-ml:test
```
---
## Alternative: Use Docker Hub or GitHub Container Registry
If Gitea continues to have issues with large images, consider:
1. **Docker Hub**: Free for public images
2. **GitHub Container Registry (ghcr.io)**: Free for public/private
3. **GitLab Container Registry**: Free tier available
These are battle-tested for large ML images and have better defaults for large uploads.

View File

@@ -1,194 +0,0 @@
# Gitea Container Registry - Image Naming Fix
## Issue
The initial build script was using incorrect image naming convention for Gitea's container registry.
### Incorrect Format
```
gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:v1.0.0
```
### Correct Format (Per Gitea Documentation)
```
gitea.harkon.co.uk/{owner}/{image}:{tag}
```
Where `{owner}` must be your **Gitea username** or **organization name**.
**Using organization:** `harkon` (Gitea team/organization)
## Solution
Updated the build script and production compose files to use the correct naming convention.
### Changes Made
#### 1. Build Script (`scripts/build-and-push-images.sh`)
**Before:**
```bash
REGISTRY="${1:-gitea.harkon.co.uk}"
VERSION="${2:-latest}"
PROJECT="ai-tax-agent"
IMAGE_NAME="$REGISTRY/$PROJECT/$service:$VERSION"
```
**After:**
```bash
REGISTRY="${1:-gitea.harkon.co.uk}"
VERSION="${2:-latest}"
OWNER="${3:-harkon}" # Gitea organization/team name
IMAGE_NAME="$REGISTRY/$OWNER/$service:$VERSION"
```
#### 2. Production Services (`infra/compose/production/services.yaml`)
**Before:**
```yaml
svc-ingestion:
image: gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:latest
```
**After:**
```yaml
svc-ingestion:
image: gitea.harkon.co.uk/harkon/svc-ingestion:latest
```
All 14 services updated:
- svc-ingestion
- svc-extract
- svc-kg
- svc-rag-retriever
- svc-rag-indexer
- svc-forms
- svc-hmrc
- svc-ocr
- svc-rpa
- svc-normalize-map
- svc-reason
- svc-firm-connectors
- svc-coverage
- ui-review
## Usage
### Build and Push Images
```bash
# With default owner (harkon organization)
./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1
# With custom owner
./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1 <your-gitea-org>
```
### Pull Images
```bash
docker pull gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
```
### Push Images Manually
```bash
# Tag image
docker tag my-image:latest gitea.harkon.co.uk/harkon/my-image:v1.0.1
# Push image
docker push gitea.harkon.co.uk/harkon/my-image:v1.0.1
```
## Gitea Registry Documentation Reference
From Gitea's official documentation:
### Image Naming Convention
Images must follow this naming convention:
```
{registry}/{owner}/{image}
```
When building your docker image, using the naming convention above, this looks like:
```bash
# build an image with tag
docker build -t {registry}/{owner}/{image}:{tag} .
# name an existing image with tag
docker tag {some-existing-image}:{tag} {registry}/{owner}/{image}:{tag}
```
### Valid Examples
For owner `testuser` on `gitea.example.com`:
- `gitea.example.com/testuser/myimage`
- `gitea.example.com/testuser/my-image`
- `gitea.example.com/testuser/my/image`
### Important Notes
1. **Owner must exist**: The owner (username or organization) must exist in Gitea
2. **Case-insensitive tags**: `image:tag` and `image:Tag` are treated as the same
3. **Authentication required**: Use personal access token with `write:package` scope
4. **Registry URL**: Use the main Gitea domain, not a separate registry subdomain
## Verification
After the fix, verify images are pushed correctly:
```bash
# Login to Gitea
docker login gitea.harkon.co.uk
# Check pushed images in Gitea UI
# Navigate to: https://gitea.harkon.co.uk/blue/-/packages
```
## Current Build Status
**Fixed and working!**
Build command:
```bash
./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1 harkon
```
Expected output:
```
Logging in to registry: gitea.harkon.co.uk
Login Succeeded
Building svc-ingestion...
Building: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
✅ Built: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
Pushing: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
✅ Pushed: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
```
## Next Steps
1. ✅ Build script fixed
2. ✅ Production compose files updated
3. 🟡 Build in progress (14 services)
4. ⏳ Deploy to production (after build completes)
## References
- [Gitea Container Registry Documentation](https://docs.gitea.com/usage/packages/container)
- Build script: `scripts/build-and-push-images.sh`
- Production services: `infra/compose/production/services.yaml`

View File

@@ -148,11 +148,11 @@ docker run --rm gitea.harkon.co.uk/harkon/svc-ocr:v1.0.1 pip list | grep torch
### 5. Update Production Deployment ### 5. Update Production Deployment
Update `infra/compose/production/services.yaml` to use `v1.0.1`: Update `infra/base/services.yaml` to use `v1.0.1`:
```bash ```bash
# Find and replace v1.0.0 with v1.0.1 # Find and replace v1.0.0 with v1.0.1
sed -i '' 's/:v1.0.0/:v1.0.1/g' infra/compose/production/services.yaml sed -i '' 's/:v1.0.0/:v1.0.1/g' infra/base/services.yaml
# Or use latest tag (already configured) # Or use latest tag (already configured)
# No changes needed if using :latest # No changes needed if using :latest

View File

@@ -50,7 +50,7 @@ docker login gitea.harkon.co.uk
**SSH to server:** **SSH to server:**
```bash ```bash
ssh deploy@141.136.35.199 ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent cd /opt/ai-tax-agent
``` ```
**Initialize Vault:** **Initialize Vault:**
@@ -62,19 +62,19 @@ docker exec -it vault vault operator unseal
**Create MinIO Buckets:** **Create MinIO Buckets:**
```bash ```bash
docker exec -it minio mc alias set local http://localhost:9092 admin <MINIO_PASSWORD> docker exec -it apa-minio mc alias set local http://localhost:9000 admin <MINIO_PASSWORD>
docker exec -it minio mc mb local/documents docker exec -it apa-minio mc mb local/documents
docker exec -it minio mc mb local/models docker exec -it apa-minio mc mb local/models
``` ```
**Create NATS Streams:** **Create NATS Streams:**
```bash ```bash
docker exec -it nats nats stream add TAX_AGENT_EVENTS \ docker exec -it apa-nats nats stream add TAX_AGENT_EVENTS \\
--subjects="tax.>" --storage=file --retention=limits --max-age=7d --subjects="tax.>" --storage=file --retention=limits --max-age=7d
``` ```
**Configure Authentik:** **Configure Authentik:**
1. Go to https://authentik.harkon.co.uk 1. Go to https://auth.harkon.co.uk
2. Create groups: `app-admin`, `app-user`, `app-reviewer` 2. Create groups: `app-admin`, `app-user`, `app-reviewer`
3. Create OAuth providers for: 3. Create OAuth providers for:
- Review UI: `app.harkon.co.uk` - Review UI: `app.harkon.co.uk`
@@ -94,7 +94,7 @@ curl -I https://api.harkon.co.uk/healthz
curl -I https://grafana.harkon.co.uk curl -I https://grafana.harkon.co.uk
# View logs # View logs
./scripts/deploy-to-production.sh logs svc-ingestion ./scripts/deploy-to-production.sh logs apa-svc-ingestion
``` ```
--- ---
@@ -127,8 +127,8 @@ curl -I https://grafana.harkon.co.uk
### Restart Service ### Restart Service
```bash ```bash
ssh deploy@141.136.35.199 ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent cd /opt/ai-tax-agent
docker compose -f services.yaml restart svc-ingestion docker compose -f services.yaml restart apa-svc-ingestion
``` ```
### Check Status ### Check Status
@@ -163,25 +163,25 @@ docker compose -f services.yaml logs svc-ingestion
docker compose -f infrastructure.yaml ps docker compose -f infrastructure.yaml ps
# Restart # Restart
docker compose -f services.yaml restart svc-ingestion docker compose -f services.yaml restart apa-svc-ingestion
``` ```
### SSL Issues ### SSL Issues
```bash ```bash
# Check Traefik logs # Check Traefik logs
docker logs traefik docker logs apa-traefik
# Check certificates # Check certificates
sudo cat /opt/compose/traefik/certs/godaddy-acme.json | jq sudo cat /opt/ai-tax-agent/traefik/certs/godaddy-acme.json | jq
``` ```
### Database Connection ### Database Connection
```bash ```bash
# Test Postgres # Test Postgres
docker exec -it postgres pg_isready -U postgres docker exec -it apa-postgres pg_isready -U postgres
# Check env vars # Check env vars
docker exec -it svc-ingestion env | grep POSTGRES docker exec -it apa-svc-ingestion env | grep POSTGRES
``` ```
--- ---
@@ -190,7 +190,7 @@ docker exec -it svc-ingestion env | grep POSTGRES
```bash ```bash
ssh deploy@141.136.35.199 ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent cd /opt/ai-tax-agent
# Stop services # Stop services
docker compose -f services.yaml down docker compose -f services.yaml down
@@ -198,12 +198,11 @@ docker compose -f infrastructure.yaml down
docker compose -f monitoring.yaml down docker compose -f monitoring.yaml down
# Restore backup # Restore backup
cd /opt/compose cd /opt
tar -xzf ~/backups/backup-YYYYMMDD-HHMMSS.tar.gz tar -xzf ~/backups/backup-YYYYMMDD-HHMMSS.tar.gz
# Restart company services # Restart application infra
cd /opt/compose/traefik && docker compose up -d cd /opt/ai-tax-agent && docker compose -f infrastructure.yaml up -d
cd /opt/compose/authentik && docker compose up -d
``` ```
--- ---
@@ -242,4 +241,3 @@ cd /opt/compose/authentik && docker compose up -d
```bash ```bash
./scripts/deploy-to-production.sh logs <service> ./scripts/deploy-to-production.sh logs <service>
``` ```

View File

@@ -16,3 +16,49 @@ http:
- X-authentik-meta-provider - X-authentik-meta-provider
- X-authentik-meta-app - X-authentik-meta-app
- X-authentik-meta-version - X-authentik-meta-version
# Large upload middleware for Gitea registry
gitea-large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
retryExpression: "IsNetworkError() && Attempts() < 3"
# Rate limiting for public APIs
api-ratelimit:
rateLimit:
average: 100
burst: 50
period: 1s
# Security headers
security-headers:
headers:
frameDeny: true
sslRedirect: true
browserXssFilter: true
contentTypeNosniff: true
stsIncludeSubdomains: true
stsPreload: true
stsSeconds: 31536000
# CORS headers
api-cors:
headers:
accessControlAllowMethods:
- GET
- POST
- PUT
- DELETE
- OPTIONS
accessControlAllowOriginList:
- "https://app.harkon.co.uk"
accessControlAllowHeaders:
- "Content-Type"
- "Authorization"
accessControlMaxAge: 100
addVaryHeader: true
# Security headers

View File

@@ -4,7 +4,9 @@ entryPoints:
address: ":80" address: ":80"
websecure: websecure:
address: ":443" address: ":443"
transport:
respondingTimeouts:
readTimeout: 30m
api: api:
dashboard: true dashboard: true

View File

@@ -1,31 +0,0 @@
# Application-specific Traefik middlewares
# These are loaded by the application infrastructure, not the external Traefik
http:
middlewares:
# Large upload middleware for Gitea registry
gitea-large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
retryExpression: "IsNetworkError() && Attempts() < 3"
# Rate limiting for public APIs
api-ratelimit:
rateLimit:
average: 100
burst: 50
period: 1s
# Security headers
security-headers:
headers:
frameDeny: true
sslRedirect: true
browserXssFilter: true
contentTypeNosniff: true
stsIncludeSubdomains: true
stsPreload: true
stsSeconds: 31536000

View File

@@ -1,25 +0,0 @@
-----BEGIN CERTIFICATE-----
MIIEHjCCAwagAwIBAgIUbOm5g4Xhb08Lk6DIpVst7+xZHOswDQYJKoZIhvcNAQEL
BQAwEDEOMAwGA1UEAwwFbG9jYWwwHhcNMjUwOTI4MTExNTM1WhcNMzUwOTI2MTEx
NTM1WjAQMQ4wDAYDVQQDDAVsb2NhbDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC
AQoCggEBAK0370DEo3dScS8uLwBsXkuaAHn9wO2fjxEHLZwHWfFo/16t+EEAi5c3
zDs7nYQ7LPLndxBfO6xZ5uWKNIVtp6ARzAeRbGgbjXDdK3fOyRdhhKR3aZVOH1D0
xUjEm/X5jEDv81sufSjk+DIQmh8hQnp3RwdHyhkIZUCTsBXMfnj+zs1UKTdRQBF5
SUplGsbh6z3xCSI4jiNRb7mNHXqV3Fv6ycwF8YdthSDfueltBP4vT/CDtebkkKPF
dx7YWEIPPUNqEoHqeI5iYP6gnWJYcr3vU+p2BuTwUICo+njzAf+P/SsjPHbujJob
dbHUclBHIrIO4BpYZtY1a7E219MbqcECAwEAAaOCAW4wggFqMB0GA1UdDgQWBBQ7
qHpza0Bb1xI1g7cMBx33JnFQljAfBgNVHSMEGDAWgBQ7qHpza0Bb1xI1g7cMBx33
JnFQljAPBgNVHRMBAf8EBTADAQH/MIIBFQYDVR0RBIIBDDCCAQiCCWxvY2FsaG9z
dIcEfwAAAYILKi5sb2NhbC5sYW6CDmF1dGgubG9jYWwubGFughFncmFmYW5hLmxv
Y2FsLmxhboIQcmV2aWV3LmxvY2FsLmxhboINYXBpLmxvY2FsLmxhboIPdmF1bHQu
bG9jYWwubGFugg9taW5pby5sb2NhbC5sYW6CE21pbmlvLWFwaS5sb2NhbC5sYW6C
EHFkcmFudC5sb2NhbC5sYW6CD25lbzRqLmxvY2FsLmxhboIUcHJvbWV0aGV1cy5s
b2NhbC5sYW6CDmxva2kubG9jYWwubGFughF1bmxlYXNoLmxvY2FsLmxhboIRdHJh
ZWZpay5sb2NhbC5sYW4wDQYJKoZIhvcNAQELBQADggEBAICf+2MZ7BHbSD/pnvll
G7Zmk+Bntj2F6RBQVZ2ZsKPWkHeZEYJDRvU0I2uL5tvvDJp4q0hjdluJllchhGgr
qfu7i+kRnhzme7oyRTFGYp8b3zHBvLyJLmdIALxuNSjIEeh1Fx0lEhKwqOlA4y6T
jziPmsGv3IonGJM2dURGNcR7DfG6H/Yl12qV8u/tVFTxqWL+hyCE7u8v+ZIcZ+fj
82X7hXt1HvfP84EhVtfqQMb5xykLtXvPqggSCFXYIj2PanWdwEdE6P5Yr2D1Yz7r
tzpmpoetrGoMWIeB0yiWgt0qJ/KK7meoCp64mqfBc48p1p/7kj2R/FRH1Jx3gFWy
dT4=
-----END CERTIFICATE-----

View File

@@ -1,28 +0,0 @@
-----BEGIN PRIVATE KEY-----
MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCtN+9AxKN3UnEv
Li8AbF5LmgB5/cDtn48RBy2cB1nxaP9erfhBAIuXN8w7O52EOyzy53cQXzusWebl
ijSFbaegEcwHkWxoG41w3St3zskXYYSkd2mVTh9Q9MVIxJv1+YxA7/NbLn0o5Pgy
EJofIUJ6d0cHR8oZCGVAk7AVzH54/s7NVCk3UUAReUlKZRrG4es98QkiOI4jUW+5
jR16ldxb+snMBfGHbYUg37npbQT+L0/wg7Xm5JCjxXce2FhCDz1DahKB6niOYmD+
oJ1iWHK971Pqdgbk8FCAqPp48wH/j/0rIzx27oyaG3Wx1HJQRyKyDuAaWGbWNWux
NtfTG6nBAgMBAAECggEAHvtkNcd2HX+HcxLloUPA0fDnqOo0OcxSQI9yHvhJpB5N
nterEaVRUmjOhMGy+NXEwmWYLDt8ZuVloSTJJBxq4PyN68SdCTn0YH2Oqs03tpDg
srIRFn10qHw/VTalVqed6HeCpYp5JHlf00SY7Hx8cX8oGytCAJw50AUad6ut62IM
sp/QFdtkLhtq9vGzQUqyIP92Y/+GbxhB+eHkuvvFau1KJq7K8qhroFTwQFts9er2
890Ujmz3bF2RhHixQcpXpsf/DMyylGJTbZDmSFkTDa/c1PzqvKrmL3wP7A3bk1E5
CP8/a65ykotJEX8RkWqH2XxvRKpdWtCaeuCsmWUQ4QKBgQDTLbC9DWHCUYMWJhyW
TKAeXx5xFGHIqggN28lIkXFiCVsTZyOuRDN7Q/CbOat/0JthrzyP18L+6ewZt2ZN
RjdfGdnpUCJx6LR4dtBH8Rc+CjlSnqEgJIkgfIs8b9uEhMI1eQV+BAFQON3BzdpT
wQ86aGsrdqtpfav7cImVfGcY/QKBgQDR+7OcnEwh8s/1J2niMKjk8agyCGGHWW4M
g+vIv7lptavgEGOPMBv7QgmeuUjwSszphQXL36m39ZRmI5B+J0/onuQzv04tJeZY
WZhA+T12a+1VnvUZNZm/qp0I2rW+4m+DmJoLQlvpaaFit/1fPJ6+IzI2VzPeWhw2
vUQ5QIYhFQKBgFUWZc3mpGsNOMol1QLiIOnb3YImejfF+rTKx9FLeOnNZzrsJb5D
kJKsDzgcBnPbc5/qYXZ7sv/O9OhvsvKTxh+1ZM3TEe3fm0emZ8l05K6EpBAcBkPT
NMU4KUnSsBo2+6Fb/9CEgJr4LrG15bA1a5NXG0dJ60r37eHDuEvY8hlpAoGADWv2
PhNrdlwL2NKtHO0ZTpD3vEL24OzhcOFZx9ohYtVe6BKEGpnrn/LHpKKZO+q8EE0V
YsOoGH8U/jZVvQqMPAUz9u7Kc25Ru+H2Lmj/+brKT8e6SOM5MZwZL4CzT0Ev+Yxe
hEu4jkHXM/Uot9arGuIrCngmc5b06LbOTo6GREUCgYArWyPYeETah/GVwU7/TNY5
5f8lNbWBoXZfpVbWdoUZT6tGWciZsiXSR4x9f+1/LMIuChegSEazrJUDt7TbCkZs
s4A66pnME37aYP2sMvJF3zSnQWVIyBgGI5xX0XW/WdozKl1mdFfigyWp58uo2dS2
TxE3dy8rxpUdDCUmvJT/Fw==
-----END PRIVATE KEY-----

View File

@@ -134,7 +134,7 @@ class Neo4jClient:
result = await self.run_query(query, {"properties": properties}, database) result = await self.run_query(query, {"properties": properties}, database)
node = result[0]["n"] if result else {} node = result[0]["n"] if result else {}
# Return node ID if available, otherwise return the full node # Return node ID if available, otherwise return the full node
return node.get("id", node) return node.get("id", node) # type: ignore
async def update_node( async def update_node(
self, self,
@@ -209,7 +209,7 @@ class Neo4jClient:
database, database,
) )
rel = result[0]["r"] if result else {} rel = result[0]["r"] if result else {}
return rel.get("id", rel) return rel.get("id", rel) # type: ignore
# Original signature (using labels and IDs) # Original signature (using labels and IDs)
rel_properties = properties or {} rel_properties = properties or {}
@@ -231,7 +231,7 @@ class Neo4jClient:
) )
rel = result[0]["r"] if result else {} rel = result[0]["r"] if result else {}
# Return relationship ID if available, otherwise return the full relationship # Return relationship ID if available, otherwise return the full relationship
return rel.get("id", rel) return rel.get("id", rel) # type: ignore
async def get_node_lineage( async def get_node_lineage(
self, node_id: str, max_depth: int = 10, database: str = "neo4j" self, node_id: str, max_depth: int = 10, database: str = "neo4j"

507
libs/ocr/processor.py Normal file
View File

@@ -0,0 +1,507 @@
import base64
import concurrent.futures
import io
import json
import os
from pathlib import Path
from typing import Any
import numpy as np
import requests
from PIL import Image, ImageFilter
from PyPDF2 import PdfReader
class OCRProcessor:
def __init__(
self,
model_name: str = "llama3.2-vision:11b",
base_url: str = "http://localhost:11434/api/generate",
max_workers: int = 1,
provider: str = "ollama",
openai_api_key: str | None = None,
openai_base_url: str = "https://api.openai.com/v1/chat/completions",
):
self.model_name = model_name
self.base_url = base_url
self.max_workers = max_workers
self.provider = provider.lower()
self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
self.openai_base_url = openai_base_url
def _encode_image(self, image_path: str) -> str:
"""Convert image to base64 string"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def _pdf_to_images(self, pdf_path: str) -> list[str]:
"""
Convert each page of a PDF to an image without PyMuPDF.
Strategy: extract largest embedded image per page via PyPDF2.
Saves each selected image as a temporary PNG and returns paths.
Note: Text-only pages with no embedded images will be skipped here.
Use _pdf_extract_text as a fallback for such pages.
"""
image_paths: list[str] = []
try:
reader = PdfReader(pdf_path)
for page_index, page in enumerate(reader.pages):
try:
resources = page.get("/Resources")
if resources is None:
continue
xobject = resources.get("/XObject")
if xobject is None:
continue
xobject = xobject.get_object()
largest = None
largest_area = -1
for _, obj_ref in xobject.items():
try:
obj = obj_ref.get_object()
if obj.get("/Subtype") != "/Image":
continue
width = int(obj.get("/Width", 0))
height = int(obj.get("/Height", 0))
area = width * height
if area > largest_area:
largest = obj
largest_area = area
except Exception:
continue
if largest is None:
continue
data = largest.get_data()
filt = largest.get("/Filter")
out_path = f"{pdf_path}_page{page_index}.png"
# If JPEG/JPX, write bytes directly; else convert via PIL
if filt in ("/DCTDecode",):
# JPEG
out_path = f"{pdf_path}_page{page_index}.jpg"
with open(out_path, "wb") as f:
f.write(data)
elif filt in ("/JPXDecode",):
out_path = f"{pdf_path}_page{page_index}.jp2"
with open(out_path, "wb") as f:
f.write(data)
else:
mode = "RGB"
colorspace = largest.get("/ColorSpace")
if colorspace in ("/DeviceGray",):
mode = "L"
width = int(largest.get("/Width", 0))
height = int(largest.get("/Height", 0))
try:
img = Image.frombytes(mode, (width, height), data)
except Exception:
# Best-effort decode via Pillow
img = Image.open(io.BytesIO(data))
img.save(out_path, format="PNG")
image_paths.append(out_path)
except Exception:
# Continue gracefully for problematic pages/objects
continue
return image_paths
except Exception as e:
raise ValueError(f"Could not extract images from PDF: {e}")
def _pdf_extract_text(self, pdf_path: str) -> list[str]:
"""Extract text per page using pdfplumber if available, else PyPDF2."""
texts: list[str] = []
try:
try:
import pdfplumber
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
texts.append(page.extract_text() or "")
return texts
except Exception:
# Fallback to PyPDF2
reader = PdfReader(pdf_path)
for page in reader.pages: # type: ignore
texts.append(page.extract_text() or "")
return texts
except Exception as e:
raise ValueError(f"Could not extract text from PDF: {e}")
def _call_ollama_vision(self, prompt: str, image_base64: str) -> str:
payload = {
"model": self.model_name,
"prompt": prompt,
"stream": False,
"images": [image_base64],
}
response = requests.post(self.base_url, json=payload)
response.raise_for_status()
return response.json().get("response", "") # type: ignore
def _call_openai_vision(self, prompt: str, image_base64: str) -> str:
if not self.openai_api_key:
raise ValueError("OPENAI_API_KEY not set")
# Compose chat.completions payload for GPT-4o/mini vision
payload = {
"model": self.model_name or "gpt-4o-mini",
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}",
},
},
],
}
],
"temperature": 0,
}
headers = {
"Authorization": f"Bearer {self.openai_api_key}",
"Content-Type": "application/json",
}
response = requests.post(self.openai_base_url, headers=headers, json=payload)
response.raise_for_status()
data = response.json()
try:
return data["choices"][0]["message"]["content"] # type: ignore
except Exception:
return json.dumps(data)
def _preprocess_image(self, image_path: str, language: str = "en") -> str:
"""
Preprocess image before OCR using Pillow + NumPy:
- Convert to grayscale
- Histogram equalization (contrast)
- Median denoise
- Otsu threshold and invert
"""
try:
with Image.open(image_path) as img:
if img.mode in ("RGBA", "LA"):
img = img.convert("RGB")
gray = img.convert("L")
# Histogram equalization via cumulative distribution
arr = np.asarray(gray)
hist, _ = np.histogram(arr.flatten(), 256, [0, 256]) # type: ignore
cdf = hist.cumsum()
cdf_masked = np.ma.masked_equal(cdf, 0) # type: ignore
cdf_min = cdf_masked.min() if cdf_masked.size else 0
cdf_max = cdf_masked.max() if cdf_masked.size else 0
if cdf_max == cdf_min:
eq = arr
else:
cdf_scaled = (cdf_masked - cdf_min) * 255 / (cdf_max - cdf_min)
lut = np.ma.filled(cdf_scaled, 0).astype("uint8")
eq = lut[arr]
eq_img = Image.fromarray(eq, mode="L")
# Median filter (3x3) to reduce noise
eq_img = eq_img.filter(ImageFilter.MedianFilter(size=3))
arr_eq = np.asarray(eq_img)
# Otsu threshold
hist2, _ = np.histogram(arr_eq, 256, [0, 256]) # type: ignore
total = arr_eq.size
sum_total = (np.arange(256) * hist2).sum()
sum_b = 0.0
w_b = 0.0
max_var = 0.0
thr = 0
for t in range(256):
w_b += hist2[t]
if w_b == 0:
continue
w_f = total - w_b
if w_f == 0:
break
sum_b += t * hist2[t]
m_b = sum_b / w_b
m_f = (sum_total - sum_b) / w_f
var_between = w_b * w_f * (m_b - m_f) ** 2
if var_between > max_var:
max_var = var_between
thr = t
binary = (arr_eq > thr).astype(np.uint8) * 255
# Invert: black text on white background
binary = 255 - binary
out_img = Image.fromarray(binary, mode="L")
preprocessed_path = f"{image_path}_preprocessed.jpg"
out_img.save(preprocessed_path, format="JPEG", quality=95)
return preprocessed_path
except Exception as e:
raise ValueError(f"Failed to preprocess image {image_path}: {e}")
def process_image(
self,
image_path: str,
format_type: str = "markdown",
preprocess: bool = True,
custom_prompt: str | None = None,
language: str = "en",
) -> str:
"""
Process an image (or PDF) and extract text in the specified format
Args:
image_path: Path to the image file or PDF file
format_type: One of ["markdown", "text", "json", "structured", "key_value","custom"]
preprocess: Whether to apply image preprocessing
custom_prompt: If provided, this prompt overrides the default based on format_type
language: Language code to apply language specific OCR preprocessing
"""
try:
# If the input is a PDF, process all pages
if image_path.lower().endswith(".pdf"):
image_pages = self._pdf_to_images(image_path)
responses: list[str] = []
if image_pages:
for idx, page_file in enumerate(image_pages):
# Process each page with preprocessing if enabled
if preprocess:
preprocessed_path = self._preprocess_image(
page_file, language
)
else:
preprocessed_path = page_file
image_base64 = self._encode_image(preprocessed_path)
if custom_prompt and custom_prompt.strip():
prompt = custom_prompt
else:
prompts = {
"markdown": f"""Extract all text content from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
Format the output in markdown:
- Use headers (#, ##, ###) **only if they appear in the image**
- Preserve original lists (-, *, numbered lists) as they are
- Maintain all text formatting (bold, italics, underlines) exactly as seen
- **Do not add, interpret, or restructure any content**
""",
"text": f"""Extract all visible text from this image in {language} **without any changes**.
- **Do not summarize, paraphrase, or infer missing text.**
- Retain all spacing, punctuation, and formatting exactly as in the image.
- If text is unclear or partially visible, extract as much as possible without guessing.
- **Include all text, even if it seems irrelevant or repeated.**
""",
"json": f"""Extract all text from this image in {language} and format it as JSON, **strictly preserving** the structure.
- **Do not summarize, add, or modify any text.**
- Maintain hierarchical sections and subsections as they appear.
- Use keys that reflect the document's actual structure (e.g., "title", "body", "footer").
- Include all text, even if fragmented, blurry, or unclear.
""",
"structured": f"""Extract all text from this image in {language}, **ensuring complete structural accuracy**:
- Identify and format tables **without altering content**.
- Preserve list structures (bulleted, numbered) **exactly as shown**.
- Maintain all section headings, indents, and alignments.
- **Do not add, infer, or restructure the content in any way.**
""",
"key_value": f"""Extract all key-value pairs from this image in {language} **exactly as they appear**:
- Identify and extract labels and their corresponding values without modification.
- Maintain the exact wording, punctuation, and order.
- Format each pair as 'key: value' **only if clearly structured that way in the image**.
- **Do not infer missing values or add any extra text.**
""",
"table": f"""Extract all tabular data from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
- **Preserve the table structure** (rows, columns, headers) as closely as possible.
- **Do not add missing values or infer content**—if a cell is empty, leave it empty.
- Maintain all numerical, textual, and special character formatting.
- If the table contains merged cells, indicate them clearly without altering their meaning.
- Output the table in a structured format such as Markdown, CSV, or JSON, based on the intended use.
""",
}
prompt = prompts.get(format_type, prompts["text"])
# Route to chosen provider
if self.provider == "openai":
res = self._call_openai_vision(prompt, image_base64)
else:
res = self._call_ollama_vision(prompt, image_base64)
responses.append(f"Page {idx + 1}:\n{res}")
# Clean up temporary files
if preprocess and preprocessed_path.endswith(
"_preprocessed.jpg"
):
try:
os.remove(preprocessed_path)
except OSError:
pass
if page_file.endswith((".png", ".jpg", ".jp2")):
try:
os.remove(page_file)
except OSError:
pass
final_result = "\n".join(responses)
if format_type == "json":
try:
json_data = json.loads(final_result)
return json.dumps(json_data, indent=2)
except json.JSONDecodeError:
return final_result
return final_result
else:
# Fallback: no images found; extract raw text per page
text_pages = self._pdf_extract_text(image_path)
combined = []
for i, t in enumerate(text_pages):
combined.append(f"Page {i + 1}:\n{t}")
return "\n".join(combined)
# Process non-PDF images as before.
if preprocess:
image_path = self._preprocess_image(image_path, language)
image_base64 = self._encode_image(image_path)
# Clean up temporary files
if image_path.endswith(("_preprocessed.jpg", "_temp.jpg")):
os.remove(image_path)
if custom_prompt and custom_prompt.strip():
prompt = custom_prompt
print("Using custom prompt:", prompt)
else:
prompts = {
"markdown": f"""Extract all text content from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
Format the output in markdown:
- Use headers (#, ##, ###) **only if they appear in the image**
- Preserve original lists (-, *, numbered lists) as they are
- Maintain all text formatting (bold, italics, underlines) exactly as seen
- **Do not add, interpret, or restructure any content**
""",
"text": f"""Extract all visible text from this image in {language} **without any changes**.
- **Do not summarize, paraphrase, or infer missing text.**
- Retain all spacing, punctuation, and formatting exactly as in the image.
- If text is unclear or partially visible, extract as much as possible without guessing.
- **Include all text, even if it seems irrelevant or repeated.**
""",
"json": f"""Extract all text from this image in {language} and format it as JSON, **strictly preserving** the structure.
- **Do not summarize, add, or modify any text.**
- Maintain hierarchical sections and subsections as they appear.
- Use keys that reflect the document's actual structure (e.g., "title", "body", "footer").
- Include all text, even if fragmented, blurry, or unclear.
""",
"structured": f"""Extract all text from this image in {language}, **ensuring complete structural accuracy**:
- Identify and format tables **without altering content**.
- Preserve list structures (bulleted, numbered) **exactly as shown**.
- Maintain all section headings, indents, and alignments.
- **Do not add, infer, or restructure the content in any way.**
""",
"key_value": f"""Extract all key-value pairs from this image in {language} **exactly as they appear**:
- Identify and extract labels and their corresponding values without modification.
- Maintain the exact wording, punctuation, and order.
- Format each pair as 'key: value' **only if clearly structured that way in the image**.
- **Do not infer missing values or add any extra text.**
""",
"table": f"""Extract all tabular data from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
- **Preserve the table structure** (rows, columns, headers) as closely as possible.
- **Do not add missing values or infer content**—if a cell is empty, leave it empty.
- Maintain all numerical, textual, and special character formatting.
- If the table contains merged cells, indicate them clearly without altering their meaning.
- Output the table in a structured format such as Markdown, CSV, or JSON, based on the intended use.
""",
}
prompt = prompts.get(format_type, prompts["text"])
print("Using default prompt:", prompt) # Debug print
# Call chosen provider with single image
if self.provider == "openai":
result = self._call_openai_vision(prompt, image_base64)
else:
result = self._call_ollama_vision(prompt, image_base64)
if format_type == "json":
try:
json_data = json.loads(result)
return json.dumps(json_data, indent=2)
except json.JSONDecodeError:
return str(result)
return str(result)
except Exception as e:
return f"Error processing image: {str(e)}"
def process_batch(
self,
input_path: str | list[str],
format_type: str = "markdown",
recursive: bool = False,
preprocess: bool = True,
custom_prompt: str | None = None,
language: str = "en",
) -> dict[str, Any]:
"""
Process multiple images in batch
Args:
input_path: Path to directory or list of image paths
format_type: Output format type
recursive: Whether to search directories recursively
preprocess: Whether to apply image preprocessing
custom_prompt: If provided, this prompt overrides the default for each image
language: Language code to apply language specific OCR preprocessing
Returns:
Dictionary with results and statistics
"""
# Collect all image paths
image_paths: list[str | Path] = []
if isinstance(input_path, str):
base_path = Path(input_path)
if base_path.is_dir():
pattern = "**/*" if recursive else "*"
for ext in [".png", ".jpg", ".jpeg", ".pdf", ".tiff"]:
image_paths.extend(base_path.glob(f"{pattern}{ext}"))
else:
image_paths = [base_path]
else:
image_paths = [Path(p) for p in input_path]
results = {}
errors = {}
# Process images in parallel
with concurrent.futures.ThreadPoolExecutor(
max_workers=self.max_workers
) as executor:
future_to_path = {
executor.submit(
self.process_image,
str(path),
format_type,
preprocess,
custom_prompt,
language,
): path
for path in image_paths
}
for future in concurrent.futures.as_completed(future_to_path):
path = future_to_path[future]
try:
results[str(path)] = future.result()
except Exception as e:
errors[str(path)] = str(e)
# pbar.update(1)
return {
"results": results,
"errors": errors,
"statistics": {
"total": len(image_paths),
"successful": len(results),
"failed": len(errors),
},
}

View File

@@ -1,13 +1,13 @@
# Core framework dependencies (Required by all services) # Core framework dependencies (Required by all services)
fastapi>=0.118.0 fastapi>=0.119.0
uvicorn[standard]>=0.37.0 uvicorn[standard]>=0.37.0
pydantic>=2.11.9 pydantic>=2.12.0
pydantic-settings>=2.11.0 pydantic-settings>=2.11.0
# Database drivers (lightweight) # Database drivers (lightweight)
sqlalchemy>=2.0.43 sqlalchemy>=2.0.44
asyncpg>=0.30.0 asyncpg>=0.30.0
psycopg2-binary>=2.9.10 psycopg2-binary>=2.9.11
neo4j>=6.0.2 neo4j>=6.0.2
redis[hiredis]>=6.4.0 redis[hiredis]>=6.4.0

View File

@@ -3,3 +3,4 @@ pdfrw>=0.4
reportlab>=4.4.4 reportlab>=4.4.4
PyPDF2>=3.0.1 PyPDF2>=3.0.1
pdfplumber>=0.11.7 pdfplumber>=0.11.7
opencv-python

View File

@@ -79,7 +79,7 @@ class StorageClient:
"""Download object from bucket""" """Download object from bucket"""
try: try:
response = self.client.get_object(bucket_name, object_name) response = self.client.get_object(bucket_name, object_name)
data = response.read() data: bytes = response.read()
response.close() response.close()
response.release_conn() response.release_conn()
@@ -89,7 +89,7 @@ class StorageClient:
object=object_name, object=object_name,
size=len(data), size=len(data),
) )
return data # type: ignore return data
except S3Error as e: except S3Error as e:
logger.error( logger.error(

View File

@@ -18,3 +18,7 @@ disallow_untyped_defs = False
[mypy-minio.*] [mypy-minio.*]
ignore_missing_imports = True ignore_missing_imports = True
[mypy-pytesseract.*]
follow_untyped_imports = True
ignore_missing_imports = True

View File

@@ -54,11 +54,20 @@ dependencies = [
"pytesseract>=0.3.10", "pytesseract>=0.3.10",
"Pillow>=10.1.0", "Pillow>=10.1.0",
"playwright>=1.40.0", "playwright>=1.40.0",
"pyshaql>=0.25.0", "pyshacl>=0.25.0",
"rdflib>=7.0.0", "rdflib>=7.0.0",
"spacy>=3.7.0", "spacy>=3.7.0",
"presidio-analyzer>=2.2.0", "presidio-analyzer>=2.2.0",
"presidio-anonymizer>=2.2.0", "presidio-anonymizer>=2.2.0",
"jsonschema>=4.0.0",
"boto3>=1.0.0",
"aiokafka>=0.8.0",
"hvac>=1.0.0",
"nats-py>=2.0.0",
"pydantic-settings>=2.0.0",
"opentelemetry-exporter-otlp>=1.0.0",
"opentelemetry-instrumentation-psycopg2>=0.42b0",
"opentelemetry-instrumentation-redis>=0.42b0",
] ]
[project.optional-dependencies] [project.optional-dependencies]

View File

@@ -56,6 +56,10 @@ numpy>=2.3.3
# PDF processing # PDF processing
pdfrw>=0.4 pdfrw>=0.4
reportlab>=4.4.4 reportlab>=4.4.4
PyPDF2>=3.0.1
pdf2image>=1.17.0
pytesseract>=0.3.10
Pillow>=10.3.0
# Date and time utilities # Date and time utilities
python-dateutil>=2.9.0 python-dateutil>=2.9.0
@@ -94,3 +98,4 @@ black>=25.9.0
isort>=6.0.1 isort>=6.0.1
bandit>=1.8.6 bandit>=1.8.6
safety>=3.6.2 safety>=3.6.2
opencv-python

View File

@@ -7,9 +7,9 @@ set -e
# Configuration # Configuration
REMOTE_HOST="deploy@141.136.35.199" REMOTE_HOST="deploy@141.136.35.199"
REMOTE_PATH="/opt/compose/ai-tax-agent" REMOTE_PATH="/opt/ai-tax-agent"
LOCAL_COMPOSE_PATH="infra/compose/production" LOCAL_COMPOSE_PATH="infra/base"
ENV_FILE="infra/compose/.env.production" ENV_FILE="infra/environments/production/.env"
# Colors for output # Colors for output
RED='\033[0;31m' RED='\033[0;31m'
@@ -66,13 +66,15 @@ backup_remote() {
ssh $REMOTE_HOST << 'EOF' ssh $REMOTE_HOST << 'EOF'
set -e set -e
mkdir -p ~/backups mkdir -p ~/backups
cd /opt/compose cd /opt
# Backup compose directory (exclude large cert files) # Backup application directory (exclude large cert files)
tar -czf ~/backups/backup-$(date +%Y%m%d-%H%M%S).tar.gz \ if [ -d ai-tax-agent ]; then
--exclude='./traefik/certs/godaddy-acme.json' \ tar -czf ~/backups/backup-$(date +%Y%m%d-%H%M%S).tar.gz \
--exclude='./*/node_modules' \ --exclude='./traefik/certs/godaddy-acme.json' \
. --exclude='./*/node_modules' \
ai-tax-agent
fi
# Document current state # Document current state
docker ps > ~/backups/current-services-$(date +%Y%m%d-%H%M%S).txt docker ps > ~/backups/current-services-$(date +%Y%m%d-%H%M%S).txt
@@ -100,6 +102,9 @@ prepare_remote() {
mkdir -p $REMOTE_PATH/grafana/provisioning mkdir -p $REMOTE_PATH/grafana/provisioning
mkdir -p $REMOTE_PATH/grafana/dashboards mkdir -p $REMOTE_PATH/grafana/dashboards
mkdir -p $REMOTE_PATH/loki mkdir -p $REMOTE_PATH/loki
mkdir -p $REMOTE_PATH/promtail
mkdir -p $REMOTE_PATH/traefik/config
mkdir -p $REMOTE_PATH/authentik
echo "Directory structure created" echo "Directory structure created"
ls -la $REMOTE_PATH ls -la $REMOTE_PATH
@@ -110,7 +115,7 @@ EOF
# Copy files to remote server # Copy files to remote server
copy_files() { copy_files() {
log_info "Copying compose files to remote server..." log_info "Copying base compose files and configs to remote server..."
# Copy compose files # Copy compose files
scp $LOCAL_COMPOSE_PATH/infrastructure.yaml $REMOTE_HOST:$REMOTE_PATH/ scp $LOCAL_COMPOSE_PATH/infrastructure.yaml $REMOTE_HOST:$REMOTE_PATH/
@@ -121,10 +126,13 @@ copy_files() {
scp $ENV_FILE $REMOTE_HOST:$REMOTE_PATH/.env scp $ENV_FILE $REMOTE_HOST:$REMOTE_PATH/.env
# Copy configuration files # Copy configuration files
scp -r infra/compose/prometheus/* $REMOTE_HOST:$REMOTE_PATH/prometheus/ scp -r $LOCAL_COMPOSE_PATH/prometheus/* $REMOTE_HOST:$REMOTE_PATH/prometheus/
scp -r infra/compose/grafana/provisioning/* $REMOTE_HOST:$REMOTE_PATH/grafana/provisioning/ scp -r $LOCAL_COMPOSE_PATH/grafana/provisioning/* $REMOTE_HOST:$REMOTE_PATH/grafana/provisioning/
scp -r infra/compose/grafana/dashboards/* $REMOTE_HOST:$REMOTE_PATH/grafana/dashboards/ scp -r $LOCAL_COMPOSE_PATH/grafana/dashboards/* $REMOTE_HOST:$REMOTE_PATH/grafana/dashboards/
scp -r infra/compose/loki/* $REMOTE_HOST:$REMOTE_PATH/loki/ scp -r $LOCAL_COMPOSE_PATH/loki/* $REMOTE_HOST:$REMOTE_PATH/loki/
scp -r $LOCAL_COMPOSE_PATH/promtail/* $REMOTE_HOST:$REMOTE_PATH/promtail/ 2>/dev/null || true
scp -r $LOCAL_COMPOSE_PATH/traefik/config/* $REMOTE_HOST:$REMOTE_PATH/traefik/config/ 2>/dev/null || true
scp -r $LOCAL_COMPOSE_PATH/authentik/* $REMOTE_HOST:$REMOTE_PATH/authentik/ 2>/dev/null || true
log_success "Files copied to remote server" log_success "Files copied to remote server"
} }

View File

@@ -1,555 +1,4 @@
# ROLE import pytest
You are a **Senior Platform Engineer + Backend Lead** generating **production code** and **ops assets** for a microservice suite that powers an accounting Knowledge Graph + Vector RAG platform. Authentication/authorization are centralized at the **edge via Traefik + Authentik** (ForwardAuth). **Services are trust-bound** to Traefik and consume user/role claims via forwarded headers/JWT. def test_happy_path():
pass
# MISSION
Produce fully working code for **all application services** (FastAPI + Python 3.12) with:
- Solid domain models, Pydantic v2 schemas, type hints, strict mypy, ruff lint.
- Opentelemetry tracing, Prometheus metrics, structured logging.
- Vault-backed secrets, MinIO S3 client, Qdrant client, Neo4j driver, Postgres (SQLAlchemy), Redis.
- Eventing (Kafka or SQS/SNS behind an interface).
- Deterministic data contracts, end-to-end tests, Dockerfiles, Compose, CI for Gitea.
- Traefik labels + Authentik Outpost integration for every exposed route.
- Zero PII in vectors (Qdrant), evidence-based lineage in KG, and bitemporal writes.
# GLOBAL CONSTRAINTS (APPLY TO ALL SERVICES)
- **Language & Runtime:** Python **3.12**.
- **Frameworks:** FastAPI, Pydantic v2, SQLAlchemy 2, httpx, aiokafka or boto3 (pluggable), redis-py, opentelemetry-instrumentation-fastapi, prometheus-fastapi-instrumentator.
- **Config:** `pydantic-settings` with `.env` overlay. Provide `Settings` class per service.
- **Secrets:** HashiCorp **Vault** (AppRole/JWT). Use Vault Transit to **envelope-encrypt** sensitive fields before persistence (helpers provided in `lib/security.py`).
- **Auth:** No OIDC in services. Add `TrustedProxyMiddleware`:
- Reject if request not from internal network (configurable CIDR).
- Require headers set by Traefik+Authentik (`X-Authenticated-User`, `X-Authenticated-Email`, `X-Authenticated-Groups`, `Authorization: Bearer `).
- Parse groups `roles` list on `request.state`.
- **Observability:**
- OpenTelemetry (traceparent propagation), span attrs (service, route, user, tenant).
- Prometheus metrics endpoint `/metrics` protected by internal network check.
- Structured JSON logs (timestamp, level, svc, trace_id, msg) via `structlog`.
- **Errors:** Global exception handler RFC7807 Problem+JSON (`type`, `title`, `status`, `detail`, `instance`, `trace_id`).
- **Testing:** `pytest`, `pytest-asyncio`, `hypothesis` (property tests for calculators), `coverage 90%` per service.
- **Static:** `ruff`, `mypy --strict`, `bandit`, `safety`, `licensecheck`.
- **Perf:** Each service exposes `/healthz`, `/readyz`, `/livez`; cold start < 500ms; p95 endpoint < 250ms (local).
- **Containers:** Distroless or slim images; non-root user; read-only FS; `/tmp` mounted for OCR where needed.
- **Docs:** OpenAPI JSON + ReDoc; MkDocs site with service READMEs.
# SHARED LIBS (GENERATE ONCE, REUSE)
Create `libs/` used by all services:
- `libs/config.py` base `Settings`, env parsing, Vault client factory, MinIO client factory, Qdrant client factory, Neo4j driver factory, Redis factory, Kafka/SQS client factory.
- `libs/security.py` Vault Transit helpers (`encrypt_field`, `decrypt_field`), header parsing, internal-CIDR validator.
- `libs/observability.py` otel init, prometheus instrumentor, logging config.
- `libs/events.py` abstract `EventBus` with `publish(topic, payload: dict)`, `subscribe(topic, handler)`. Two impls: Kafka (`aiokafka`) and SQS/SNS (`boto3`).
- `libs/schemas.py` **canonical Pydantic models** shared across services (Document, Evidence, IncomeItem, etc.) mirroring the ontology schemas. Include JSONSchema exports.
- `libs/storage.py` S3/MinIO helpers (bucket ensure, put/get, presigned).
- `libs/neo.py` Neo4j session helpers, Cypher runner with retry, SHACL validator invoker (pySHACL on exported RDF).
- `libs/rag.py` Qdrant collections CRUD, hybrid search (dense+sparse), rerank wrapper, de-identification utilities (regex + NER; hash placeholders).
- `libs/forms.py` PDF AcroForm fill via `pdfrw` with overlay fallback via `reportlab`.
- `libs/calibration.py` `calibrated_confidence(raw_score, method="temperature_scaling", params=...)`.
# EVENT TOPICS (STANDARDIZE)
- `doc.ingested`, `doc.ocr_ready`, `doc.extracted`, `kg.upserted`, `rag.indexed`, `calc.schedule_ready`, `form.filled`, `hmrc.submitted`, `review.requested`, `review.completed`, `firm.sync.completed`
Each payload MUST include: `event_id (ulid)`, `occurred_at (iso)`, `actor`, `tenant_id`, `trace_id`, `schema_version`, and a `data` object (service-specific).
# TRUST HEADERS FROM TRAEFIK + AUTHENTIK (USE EXACT KEYS)
- `X-Authenticated-User` (string)
- `X-Authenticated-Email` (string)
- `X-Authenticated-Groups` (comma-separated)
- `Authorization` (`Bearer <jwt>` from Authentik)
Reject any request missing these (except `/healthz|/readyz|/livez|/metrics` from internal CIDR).
---
## SERVICES TO IMPLEMENT (CODE FOR EACH)
### 1) `svc-ingestion`
**Purpose:** Accept uploads or URLs, checksum, store to MinIO, emit `doc.ingested`.
**Endpoints:**
- `POST /v1/ingest/upload` (multipart file, metadata: `tenant_id`, `kind`, `source`) `{doc_id, s3_url, checksum}`
- `POST /v1/ingest/url` (json: `{url, kind, tenant_id}`) downloads to MinIO
- `GET /v1/docs/{doc_id}` metadata
**Logic:**
- Compute SHA256, dedupe by checksum; MinIO path `tenants/{tenant_id}/raw/{doc_id}.pdf`.
- Store metadata in Postgres table `ingest_documents` (alembic migrations).
- Publish `doc.ingested` with `{doc_id, bucket, key, pages?, mime}`.
**Env:** `S3_BUCKET_RAW`, `MINIO_*`, `DB_URL`.
**Traefik labels:** route `/ingest/*`.
---
### 2) `svc-rpa`
**Purpose:** Scheduled RPA pulls from firm/client portals via Playwright.
**Tasks:**
- Playwright login flows (credentials from Vault), 2FA via Authentik OAuth device or OTP secret in Vault.
- Download statements/invoices; hand off to `svc-ingestion` via internal POST.
- Prefect flows: `pull_portal_X()`, `pull_portal_Y()` with schedules.
**Endpoints:**
- `POST /v1/rpa/run/{connector}` (manual trigger)
- `GET /v1/rpa/status/{run_id}`
**Env:** `VAULT_ADDR`, `VAULT_ROLE_ID`, `VAULT_SECRET_ID`.
---
### 3) `svc-ocr`
**Purpose:** OCR & layout extraction.
**Pipeline:**
- Pull object from MinIO, detect rotation/de-skew (`opencv-python`), split pages (`pymupdf`), OCR (`pytesseract`) or bypass if text layer present (`pdfplumber`).
- Output per-page text + **bbox** for lines/words.
- Write JSON to MinIO `tenants/{tenant_id}/ocr/{doc_id}.json` and emit `doc.ocr_ready`.
**Endpoints:**
- `POST /v1/ocr/{doc_id}` (idempotent trigger)
- `GET /v1/ocr/{doc_id}` (fetch OCR JSON)
**Env:** `TESSERACT_LANGS`, `S3_BUCKET_EVIDENCE`.
---
### 4) `svc-extract`
**Purpose:** Classify docs and extract KV + tables into **schema-constrained JSON** (with bbox/page).
**Endpoints:**
- `POST /v1/extract/{doc_id}` body: `{strategy: "llm|rules|hybrid"}`
- `GET /v1/extract/{doc_id}` structured JSON
**Implementation:**
- Use prompt files in `prompts/`: `doc_classify.txt`, `kv_extract.txt`, `table_extract.txt`.
- **Validator loop**: run LLM validate JSONSchema retry with error messages up to N times.
- Return Pydantic models from `libs/schemas.py`.
- Emit `doc.extracted`.
**Env:** `LLM_ENGINE`, `TEMPERATURE`, `MAX_TOKENS`.
---
### 5) `svc-normalize-map`
**Purpose:** Normalize & map extracted data to KG.
**Logic:**
- Currency normalization (ECB or static fx table), dates, UK tax year/basis period inference.
- Entity resolution (blocking + fuzzy).
- Generate nodes/edges (+ `Evidence` with doc_id/page/bbox/text_hash).
- Use `libs/neo.py` to write with **bitemporal** fields; run **SHACL** validator; on violation, queue `review.requested`.
- Emit `kg.upserted`.
**Endpoints:**
- `POST /v1/map/{doc_id}`
- `GET /v1/map/{doc_id}/preview` (diff view, to be used by UI)
**Env:** `NEO4J_*`.
---
### 6) `svc-kg`
**Purpose:** Graph façade + RDF/SHACL utility.
**Endpoints:**
- `GET /v1/kg/nodes/{label}/{id}`
- `POST /v1/kg/cypher` (admin-gated inline query; must check `admin` role)
- `POST /v1/kg/export/rdf` (returns RDF for SHACL)
- `POST /v1/kg/validate` (run pySHACL against `schemas/shapes.ttl`)
- `GET /v1/kg/lineage/{node_id}` (traverse `DERIVED_FROM` Evidence)
**Env:** `NEO4J_*`.
---
### 7) `svc-rag-indexer`
**Purpose:** Build Qdrant indices (firm knowledge, legislation, best practices, glossary).
**Workflow:**
- Load sources (filesystem, URLs, Firm DMS via `svc-firm-connectors`).
- **De-identify PII** (regex + NER), replace with placeholders; store mapping only in Postgres.
- Chunk (layout-aware) per `retrieval/chunking.yaml`.
- Compute **dense** embeddings (e.g., `bge-small-en-v1.5`) and **sparse** (Qdrant sparse).
- Upsert to Qdrant with payload `{jurisdiction, tax_years[], topic_tags[], version, pii_free: true, doc_id/section_id/url}`.
- Emit `rag.indexed`.
**Endpoints:**
- `POST /v1/index/run`
- `GET /v1/index/status/{run_id}`
**Env:** `QDRANT_URL`, `RAG_EMBEDDING_MODEL`, `RAG_RERANKER_MODEL`.
---
### 8) `svc-rag-retriever`
**Purpose:** Hybrid search + KG fusion with rerank and calibrated confidence.
**Endpoint:**
- `POST /v1/rag/search` `{query, tax_year?, jurisdiction?, k?}`
```
{
"chunks": [...],
"citations": [{doc_id|url, section_id?, page?, bbox?}],
"kg_hints": [{rule_id, formula_id, node_ids[]}],
"calibrated_confidence": 0.0-1.0
}
```
**Implementation:**
- Hybrid score: `alpha * dense + beta * sparse`; rerank top-K via cross-encoder; **KG fusion** (boost chunks citing Rules/Calculations relevant to schedule).
- Use `libs/calibration.py` to expose calibrated confidence.
---
### 9) `svc-reason`
**Purpose:** Deterministic calculators + materializers (UK SA).
**Endpoints:**
- `POST /v1/reason/compute_schedule` `{tax_year, taxpayer_id, schedule_id}`
- `GET /v1/reason/explain/{schedule_id}` rationale & lineage paths
**Implementation:**
- Pure functions for: employment, self-employment, property (FHL, 20% interest credit), dividends/interest, allowances, NIC (Class 2/4), HICBC, student loans (Plans 1/2/4/5, PGL).
- **Deterministic order** as defined; rounding per `FormBox.rounding_rule`.
- Use Cypher from `kg/reasoning/schedule_queries.cypher` to materialize box values; attach `DERIVED_FROM` evidence.
---
### 10) `svc-forms`
**Purpose:** Fill PDFs and assemble evidence bundles.
**Endpoints:**
- `POST /v1/forms/fill` `{tax_year, taxpayer_id, form_id}` returns PDF (binary)
- `POST /v1/forms/evidence_pack` `{scope}` ZIP + manifest + signed hashes (sha256)
**Implementation:**
- `pdfrw` for AcroForm; overlay with ReportLab if needed.
- Manifest includes `doc_id/page/bbox/text_hash` for every numeric field.
---
### 11) `svc-hmrc`
**Purpose:** HMRC submitter (stub|sandbox|live).
**Endpoints:**
- `POST /v1/hmrc/submit` `{tax_year, taxpayer_id, dry_run}` `{status, submission_id?, errors[]}`
- `GET /v1/hmrc/submissions/{id}`
**Implementation:**
- Rate limits, retries/backoff, signed audit log; environment toggle.
---
### 12) `svc-firm-connectors`
**Purpose:** Read-only connectors to Firm Databases (Practice Mgmt, DMS).
**Endpoints:**
- `POST /v1/firm/sync` `{since?}` `{objects_synced, errors[]}`
- `GET /v1/firm/objects` (paged)
**Implementation:**
- Data contracts in `config/firm_contracts/`; mappers Secure Client Data Store (Postgres) with lineage columns (`source`, `source_id`, `synced_at`).
---
### 13) `ui-review` (outline only)
- Next.js (SSO handled by Traefik+Authentik), shows extracted fields + evidence snippets; POST overrides to `svc-extract`/`svc-normalize-map`.
---
## DATA CONTRACTS (ESSENTIAL EXAMPLES)
**Event: `doc.ingested`**
```json
{
"event_id": "01J...ULID",
"occurred_at": "2025-09-13T08:00:00Z",
"actor": "svc-ingestion",
"tenant_id": "t_123",
"trace_id": "abc-123",
"schema_version": "1.0",
"data": {
"doc_id": "d_abc",
"bucket": "raw",
"key": "tenants/t_123/raw/d_abc.pdf",
"checksum": "sha256:...",
"kind": "bank_statement",
"mime": "application/pdf",
"pages": 12
}
}
```
**RAG search response shape**
```json
{
"chunks": [
{
"id": "c1",
"text": "...",
"score": 0.78,
"payload": {
"jurisdiction": "UK",
"tax_years": ["2024-25"],
"topic_tags": ["FHL"],
"pii_free": true
}
}
],
"citations": [
{ "doc_id": "leg-ITA2007", "section_id": "s272A", "url": "https://..." }
],
"kg_hints": [
{
"rule_id": "UK.FHL.Qual",
"formula_id": "FHL_Test_v1",
"node_ids": ["n123", "n456"]
}
],
"calibrated_confidence": 0.81
}
```
---
## PERSISTENCE SCHEMAS (POSTGRES; ALEMBIC)
- `ingest_documents(id pk, tenant_id, doc_id, kind, checksum, bucket, key, mime, pages, created_at)`
- `firm_objects(id pk, tenant_id, source, source_id, type, payload jsonb, synced_at)`
- Qdrant PII mapping table (if absolutely needed): `pii_links(id pk, placeholder_hash, client_id, created_at)` **encrypt with Vault Transit**; do NOT store raw values.
---
## TRAEFIK + AUTHENTIK (COMPOSE LABELS PER SERVICE)
For every service container in `infra/compose/docker-compose.local.yml`, add labels:
```
- "traefik.enable=true"
- "traefik.http.routers.svc-extract.rule=Host(`api.local`) && PathPrefix(`/extract`)"
- "traefik.http.routers.svc-extract.entrypoints=websecure"
- "traefik.http.routers.svc-extract.tls=true"
- "traefik.http.routers.svc-extract.middlewares=authentik-forwardauth,rate-limit"
- "traefik.http.services.svc-extract.loadbalancer.server.port=8000"
```
Use the shared dynamic file `traefik-dynamic.yml` with `authentik-forwardauth` and `rate-limit` middlewares.
---
## OUTPUT FORMAT (STRICT)
Implement a **multi-file codebase** as fenced blocks, EXACTLY in this order:
```txt
# FILE: libs/config.py
# factories for Vault/MinIO/Qdrant/Neo4j/Redis/EventBus, Settings base
...
```
```txt
# FILE: libs/security.py
# Vault Transit helpers, header parsing, internal CIDR checks, middleware
...
```
```txt
# FILE: libs/observability.py
# otel init, prometheus, structlog
...
```
```txt
# FILE: libs/events.py
# EventBus abstraction with Kafka and SQS/SNS impls
...
```
```txt
# FILE: libs/schemas.py
# Shared Pydantic models mirroring ontology entities
...
```
```txt
# FILE: apps/svc-ingestion/main.py
# FastAPI app, endpoints, MinIO write, Postgres, publish doc.ingested
...
```
```txt
# FILE: apps/svc-rpa/main.py
# Playwright flows, Prefect tasks, triggers
...
```
```txt
# FILE: apps/svc-ocr/main.py
# OCR pipeline, endpoints
...
```
```txt
# FILE: apps/svc-extract/main.py
# Classifier + extractors with validator loop
...
```
```txt
# FILE: apps/svc-normalize-map/main.py
# normalization, entity resolution, KG mapping, SHACL validation call
...
```
```txt
# FILE: apps/svc-kg/main.py
# KG façade, RDF export, SHACL validate, lineage traversal
...
```
```txt
# FILE: apps/svc-rag-indexer/main.py
# chunk/de-id/embed/upsert to Qdrant
...
```
```txt
# FILE: apps/svc-rag-retriever/main.py
# hybrid retrieval + rerank + KG fusion
...
```
```txt
# FILE: apps/svc-reason/main.py
# deterministic calculators, schedule compute/explain
...
```
```txt
# FILE: apps/svc-forms/main.py
# PDF fill + evidence pack
...
```
```txt
# FILE: apps/svc-hmrc/main.py
# submit stub|sandbox|live with audit + retries
...
```
```txt
# FILE: apps/svc-firm-connectors/main.py
# connectors to practice mgmt & DMS, sync to Postgres
...
```
```txt
# FILE: infra/compose/docker-compose.local.yml
# Traefik, Authentik, Vault, MinIO, Qdrant, Neo4j, Postgres, Redis, Prom+Grafana, Loki, Unleash, all services
...
```
```txt
# FILE: infra/compose/traefik.yml
# static Traefik config
...
```
```txt
# FILE: infra/compose/traefik-dynamic.yml
# forwardAuth middleware + routers/services
...
```
```txt
# FILE: .gitea/workflows/ci.yml
# lint->test->build->scan->push->deploy
...
```
```txt
# FILE: Makefile
# bootstrap, run, test, lint, build, deploy, format, seed
...
```
```txt
# FILE: tests/e2e/test_happy_path.py
# end-to-end: ingest -> ocr -> extract -> map -> compute -> fill -> (stub) submit
...
```
```txt
# FILE: tests/unit/test_calculators.py
# boundary tests for UK SA logic (NIC, HICBC, PA taper, FHL)
...
```
```txt
# FILE: README.md
# how to run locally with docker-compose, Authentik setup, Traefik certs
...
```
## DEFINITION OF DONE
- `docker compose up` brings the full stack up; SSO via Authentik; routes secured via Traefik ForwardAuth.
- Running `pytest` yields 90% coverage; `make e2e` passes the ingestsubmit stub flow.
- All services expose `/healthz|/readyz|/livez|/metrics`; OpenAPI at `/docs`.
- No PII stored in Qdrant; vectors carry `pii_free=true`.
- KG writes are SHACL-validated; violations produce `review.requested` events.
- Evidence lineage is present for every numeric box value.
- Gitea pipeline passes: lint, test, build, scan, push, deploy.
# START
Generate the full codebase and configs in the **exact file blocks and order** specified above.