deployment, linting and infra configuration
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

harkon
2025-10-14 07:42:31 +01:00
parent f0f7674b8d
commit eea46ac89c
41 changed files with 1017 additions and 1448 deletions

View File

@@ -1,6 +1,6 @@
"""Database models for coverage service."""
# FILE: apps/svc-coverage/models.py
# FILE: apps/svc_coverage/models.py
from datetime import datetime

View File

@@ -0,0 +1,13 @@
# Service-specific dependencies for svc_coverage
# Database migrations
alembic>=1.14.0
# OpenTelemetry (required by libs.observability)
opentelemetry-api>=1.37.0
opentelemetry-sdk>=1.37.0
opentelemetry-exporter-otlp-proto-grpc>=1.37.0
opentelemetry-instrumentation-fastapi>=0.42b0
opentelemetry-instrumentation-httpx>=0.42b0
opentelemetry-instrumentation-psycopg2>=0.42b0
opentelemetry-instrumentation-redis>=0.42b0

View File

@@ -1,17 +1,17 @@
# Service-specific dependencies for svc_extract
# LLM integration
openai>=1.3.0
anthropic>=0.7.0
openai>=2.3.0
anthropic>=0.69.0
# JSON schema validation
jsonschema>=4.20.0
jsonschema>=4.25.1
# Template processing
jinja2>=3.1.0
jinja2>=3.1.6
# Text similarity (lightweight)
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0
python-Levenshtein>=0.27.1
# Data validation
cerberus>=1.3.4
cerberus>=1.3.7

View File

@@ -1,45 +1,45 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.5.0
# Service-specific dependencies
# Database connectors
sqlalchemy>=2.0.0
pymssql>=2.2.0
sqlalchemy>=2.0.44
pymssql>=2.3.7
cx-Oracle>=8.3.0
# API clients for practice management systems
zeep>=4.2.0 # SOAP client
xmltodict>=0.13.0
zeep>=4.3.2 # SOAP client
xmltodict>=1.0.2
# OAuth for various systems
authlib>=1.2.0
requests-oauthlib>=1.3.0
authlib>=1.6.5
requests-oauthlib>=2.0.0
# Data synchronization
pandas>=2.1.0
pandas>=2.3.3
# Rate limiting
ratelimit>=2.2.0
ratelimit>=2.2.1
# Retry mechanisms
tenacity>=8.2.0
tenacity>=9.1.2
# CSV processing
csvkit>=1.1.0
csvkit>=2.1.0
# Excel file processing
openpyxl>=3.1.0
xlrd>=2.0.0
openpyxl>=3.1.5
xlrd>=2.0.2
# Data validation
marshmallow>=3.20.0
cerberus>=1.3.4
marshmallow>=4.0.1
cerberus>=1.3.7
# Connection pooling (built into SQLAlchemy)
# sqlalchemy-pool>=1.3.0 # Package doesn't exist, pooling is built into SQLAlchemy
# Additional utilities
python-dateutil>=2.8.0
pytz>=2023.3
python-dateutil>=2.9.0
pytz>=2025.2

View File

@@ -1,37 +1,37 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0
# Service-specific dependencies
# PDF form filling
pdfrw>=0.4
reportlab>=4.0.0
reportlab>=4.4.4
# PDF processing
PyPDF2>=3.0.0
pypdf>=3.17.0
PyPDF2>=3.0.1
pypdf>=6.1.1
# Image processing for overlays
Pillow>=10.1.0
Pillow>=11.3.0
# ZIP file creation for evidence packs
zipfile36>=0.1.3
# Template processing
jinja2>=3.1.0
jinja2>=3.1.6
# QR code generation
qrcode>=7.4.0
qrcode>=8.2
# Barcode generation
python-barcode>=0.15.0
python-barcode>=0.16.1
# Font handling
fonttools>=4.44.0
fonttools>=4.60.1
# Additional PDF utilities
pdfminer.six>=20231228
pdfminer.six>=20250506
# Document conversion
python-docx>=1.1.0
python-docx>=1.2.0

View File

@@ -1,40 +1,40 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0
# Service-specific dependencies
# OAuth and authentication
authlib>=1.2.0
oauthlib>=3.2.0
authlib>=1.6.5
oauthlib>=3.3.1
# HTTP client with OAuth support
requests-oauthlib>=1.3.0
requests-oauthlib>=2.0.0
# XML processing for HMRC APIs
lxml>=4.9.0
xmltodict>=0.13.0
lxml>=6.0.2
xmltodict>=1.0.2
# JSON Web Tokens
pyjwt>=2.8.0
pyjwt>=2.10.1
# UK government API utilities
govuk-frontend-jinja>=2.8.0
govuk-frontend-jinja>=3.8.0
# Date and time for tax years
python-dateutil>=2.8.0
python-dateutil>=2.9.0
# Retry mechanisms
tenacity>=8.2.0
tenacity>=9.1.2
# Rate limiting
ratelimit>=2.2.0
ratelimit>=2.2.1
# API validation
marshmallow>=3.20.0
marshmallow>=4.0.1
# Encryption for sensitive data
cryptography>=41.0.0
cryptography>=46.0.2
# Additional HTTP utilities
urllib3>=2.1.0
urllib3>=2.5.0

View File

@@ -1,22 +1,22 @@
# Service-specific dependencies
# RDF and semantic web
rdflib>=7.0.0
pyshacl>=0.25.0
rdflib>=7.2.1
pyshacl>=0.30.1
# Graph algorithms
networkx>=3.2.0
networkx>=3.5
# Data export formats
xmltodict>=0.13.0
xmltodict>=1.0.2
# Query optimization
pyparsing>=3.1.0
pyparsing>=3.2.5
# Graph visualization (optional)
graphviz>=0.20.0
graphviz>=0.21
# Additional Neo4j utilities
neomodel>=5.2.0
neomodel>=5.5.3
# Cypher query building
py2neo>=2021.2.4

View File

@@ -1,37 +1,37 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0
# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.1.0
numpy>=1.24.0
pandas>=2.3.3
numpy>=2.3.3
# Currency and exchange rates
forex-python>=1.8
babel>=2.13.0
forex-python>=1.9.2
babel>=2.17.0
# Date and time processing
python-dateutil>=2.8.0
pytz>=2023.3
python-dateutil>=2.9.0
pytz>=2025.2
# Text normalization
unidecode>=1.3.0
phonenumbers>=8.13.0
unidecode>=1.4.0
phonenumbers>=9.0.16
# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0
python-Levenshtein>=0.27.1
# Geographic data
geopy>=2.4.0
pycountry>=23.12.0
geopy>=2.4.1
pycountry>=24.6.1
# Data validation
cerberus>=1.3.4
marshmallow>=3.20.0
cerberus>=1.3.7
marshmallow>=4.0.1
# UK-specific utilities
uk-postcode-utils>=1.0.0
uk-postcode-utils>=1.1

View File

@@ -1,17 +1,23 @@
# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
import asyncio
import io
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
from typing import Any, cast
import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -19,6 +25,7 @@ from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.ocr.processor import OCRProcessor
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
@@ -48,28 +55,31 @@ class OCRSettings(BaseAppSettings):
include_coordinates: bool = True
include_confidence: bool = True
# Vision/LLM OCR configuration
vision_provider: str = "ollama" # or "openai"
vision_model: str = "llama3.2-vision:11b"
vision_format: str = (
"text" # text | markdown | json | table | key_value | structured
)
vision_preprocess: bool = True
openai_base_url: str = "https://api.openai.com/v1/chat/completions"
# Create app and settings
app, settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
vision_processor: OCRProcessor | None = None
# Settings will be initialized after app creation
settings: OCRSettings
@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: OCRSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus
global storage_client, document_storage, event_bus, settings, vision_processor
settings = app_settings
logger.info("Starting OCR service")
# Setup observability
@@ -79,42 +89,44 @@ async def startup_event() -> None:
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
eb = event_bus
# mypy: event_bus is Optional, so use local alias after check
await eb.start()
# Subscribe to document ingestion events
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
# Initialize shared OCRProcessor for vision strategy
try:
vision_processor = OCRProcessor(
model_name=settings.vision_model,
provider=settings.vision_provider,
openai_base_url=settings.openai_base_url,
)
except Exception as e:
logger.error("Failed to initialize vision OCR processor", error=str(e))
logger.info("OCR service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
# Create app and settings
app, _settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
logger.info("Shutting down OCR service")
# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
if event_bus:
await event_bus.stop()
logger.info("OCR service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
@app.post("/process/{doc_id}")
@@ -132,9 +144,14 @@ async def process_document(
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", strategy)
ds = document_storage
if ds is None:
raise HTTPException(
status_code=500, detail="Document storage not initialized"
)
try:
# Check if document exists
doc_content = await document_storage.get_document(tenant_id, doc_id)
doc_content = await ds.get_document(tenant_id, doc_id)
if not doc_content:
raise HTTPException(status_code=404, detail="Document not found")
@@ -142,9 +159,9 @@ async def process_document(
processing_id = str(ulid.new())
span.set_attribute("processing_id", processing_id)
# Start background processing
# Start background processing via sync wrapper (for mypy correctness)
background_tasks.add_task(
_process_document_async,
_schedule_process_document_async,
doc_id,
tenant_id,
doc_content,
@@ -168,7 +185,9 @@ async def process_document(
raise
except Exception as e:
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start processing")
raise HTTPException(
status_code=500, detail="Failed to start processing"
) from e
@app.get("/results/{doc_id}")
@@ -183,9 +202,14 @@ async def get_ocr_results(
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
ds = document_storage
if ds is None:
raise HTTPException(
status_code=500, detail="Document storage not initialized"
)
try:
# Get OCR results from storage
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
ocr_results = await ds.get_ocr_result(tenant_id, doc_id)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
@@ -196,26 +220,32 @@ async def get_ocr_results(
raise
except Exception as e:
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get OCR results")
raise HTTPException(
status_code=500, detail="Failed to get OCR results"
) from e
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
"""Handle document ingestion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
if not doc_id or not tenant_id:
logger.warning("Invalid document ingestion event", data=data)
return
if not doc_id or not tenant_id:
logger.warning("Invalid document ingestion event", data=data)
return
ds = document_storage
if ds is None:
logger.error("Document storage not initialized")
return
# Auto-process PDF documents
if data.get("content_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id)
# Auto-process PDF documents
if data.get("content_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id)
try:
# Get document content
doc_content = await document_storage.get_document(tenant_id, doc_id)
doc_content = await ds.get_document(tenant_id, doc_id)
if doc_content:
await _process_document_async(
doc_id=doc_id,
@@ -225,9 +255,10 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
processing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle document ingestion", error=str(e))
except Exception as e:
logger.error(
"Failed to handle document ingestion", doc_id=doc_id, error=str(e)
)
async def _process_document_async(
@@ -250,8 +281,8 @@ async def _process_document_async(
images = await _pdf_to_images(content)
# Process each page
pages_data: list[Any] = []
for page_num, image in enumerate(images, 1):
pages_data: list[dict[str, Any]] = []
for page_num, image in enumerate(images, 0):
page_data = await _process_page(image, page_num, strategy)
pages_data.append(page_data)
@@ -270,7 +301,10 @@ async def _process_document_async(
}
# Store results
await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)
ds = document_storage
if ds is None:
raise RuntimeError("Document storage not initialized")
await ds.store_ocr_result(tenant_id, doc_id, ocr_results)
# Update metrics
metrics.counter("documents_processed_total").labels(
@@ -282,7 +316,7 @@ async def _process_document_async(
).observe(
datetime.utcnow().timestamp()
- datetime.fromisoformat(
ocr_results["processed_at"].replace("Z", "")
ocr_results["processed_at"].replace("Z", "") # type: ignore
).timestamp()
)
@@ -300,7 +334,9 @@ async def _process_document_async(
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)
eb = event_bus
if eb is not None:
await eb.publish(EventTopics.DOC_OCR_READY, event_payload)
logger.info(
"OCR processing completed", doc_id=doc_id, pages=len(pages_data)
@@ -316,58 +352,91 @@ async def _process_document_async(
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
"""Convert PDF to images"""
"""Convert PDF to page images without PyMuPDF.
Primary: pdf2image (requires poppler). Fallback: extract largest embedded image per page via PyPDF2/Pillow.
"""
# First try pdf2image for full-page rasterization
try:
import fitz # PyMuPDF
# Open PDF
pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
images: list[Any] = []
for page_num in range(min(len(pdf_doc), settings.max_pages)):
page = pdf_doc[page_num]
# Render page to image
mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
images.append(img_data)
pdf_doc.close()
return images
except ImportError:
logger.error("PyMuPDF not available, using fallback")
return await _pdf_to_images_fallback(pdf_content)
except Exception as e:
logger.error("PDF conversion failed", error=str(e))
raise
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
"""Fallback PDF to images conversion"""
try:
from pdf2image import convert_from_bytes
images = convert_from_bytes(
pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
)
# Convert PIL images to bytes
image_bytes: list[Any] = []
image_bytes: list[bytes] = []
for img in images:
import io
img_buffer = io.BytesIO()
img.save(img_buffer, format="PNG")
image_bytes.append(img_buffer.getvalue())
return image_bytes
except Exception as e:
logger.warning(
"pdf2image conversion failed; falling back to PyPDF2", error=str(e)
)
except ImportError:
logger.error("pdf2image not available")
raise Exception("No PDF conversion library available")
# Fallback: extract largest embedded image per page using PyPDF2
try:
reader = PdfReader(io.BytesIO(pdf_content))
out_images: list[bytes] = []
for page_index, page in enumerate(reader.pages):
if page_index >= settings.max_pages:
break
try:
resources = page.get("/Resources")
if resources is None:
continue
xobject = resources.get("/XObject")
if xobject is None:
continue
xobject = xobject.get_object()
largest = None
largest_area = -1
for _, obj_ref in xobject.items():
try:
obj = obj_ref.get_object()
if obj.get("/Subtype") != "/Image":
continue
width = int(obj.get("/Width", 0))
height = int(obj.get("/Height", 0))
area = width * height
if area > largest_area:
largest = obj
largest_area = area
except Exception:
continue
if largest is None:
continue
data = largest.get_data()
filt = largest.get("/Filter")
if filt in ("/DCTDecode", "/JPXDecode"):
# JPEG or JPEG2000
out_images.append(data)
else:
# Flate or other; decode via Pillow
mode = "RGB"
colorspace = largest.get("/ColorSpace")
if colorspace in ("/DeviceGray",):
mode = "L"
width = int(largest.get("/Width", 0))
height = int(largest.get("/Height", 0))
try:
img = Image.frombytes(mode, (width, height), data)
except Exception:
img = Image.open(io.BytesIO(data))
buf = io.BytesIO()
img.save(buf, format="PNG")
out_images.append(buf.getvalue())
except Exception:
continue
if not out_images:
raise RuntimeError("No images extracted via PyPDF2 fallback")
return out_images
except Exception as fallback_e:
logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
raise
async def _process_page(
@@ -395,6 +464,8 @@ async def _process_page(
layoutlm_result.get("confidence", 0),
),
}
elif strategy == "vision":
return await _process_with_vision(image_data, page_num)
else:
raise ValueError(f"Unknown strategy: {strategy}")
@@ -402,11 +473,6 @@ async def _process_page(
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with Tesseract OCR"""
try:
import io
import pytesseract
from PIL import Image
# Load image
image = Image.open(io.BytesIO(image_data))
@@ -414,13 +480,13 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
config = f"{settings.tesseract_config} -l {settings.languages}"
# Extract text with confidence
data = pytesseract.image_to_data(
data = pytesseract.image_to_data( # type: ignore
image, config=config, output_type=pytesseract.Output.DICT
)
# Process results
words: list[Any] = []
confidences: list[Any] = []
words: list[dict[str, Any]] = []
confidences: list[float] = []
for i in range(len(data["text"])):
if int(data["conf"][i]) > 0: # Valid confidence
@@ -449,13 +515,6 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
"word_count": len(words),
}
except ImportError:
logger.error("pytesseract not available")
return {
"page": page_num,
"strategy": "tesseract",
"error": "pytesseract not available",
}
except Exception as e:
logger.error("Tesseract processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "tesseract", "error": str(e)}
@@ -482,6 +541,68 @@ async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str,
return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with LLM vision OCR via shared OCRProcessor"""
try:
vp = vision_processor
if vp is None:
raise RuntimeError("Vision OCR processor not initialized")
# Persist the page image temporarily for the processor API
import tempfile
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
tmp.write(image_data)
tmp_path = tmp.name
try:
text = vp.process_image(
image_path=tmp_path,
format_type=settings.vision_format,
preprocess=settings.vision_preprocess,
language=settings.languages,
)
finally:
try:
os.remove(tmp_path)
except OSError:
pass
return {
"page": page_num,
"strategy": "vision",
"text": text if isinstance(text, str) else str(text),
"confidence": 0.0, # Not provided by LLM API
}
except Exception as e:
logger.error("Vision processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "vision", "error": str(e)}
def _schedule_process_document_async(
doc_id: str,
tenant_id: str,
content: bytes,
strategy: str,
processing_id: str,
actor: str,
) -> None:
"""Sync wrapper to schedule the async OCR task.
This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict.
"""
asyncio.create_task(
_process_document_async(
doc_id=doc_id,
tenant_id=tenant_id,
content=content,
strategy=strategy,
processing_id=processing_id,
actor=actor,
)
)
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""

View File

@@ -5,7 +5,7 @@
pytesseract>=0.3.13
# PDF processing
PyMuPDF>=1.26.4
PyPDF2>=3.0.1
pdf2image>=1.17.0
# Image processing

View File

@@ -2,7 +2,7 @@
# NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
# Text chunking (lightweight alternative to langchain)
tiktoken>=0.11.0
tiktoken>=0.12.0
# Text preprocessing (lightweight)
beautifulsoup4>=4.14.2

View File

@@ -1,20 +1,20 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0
# Service-specific dependencies
# Mathematical calculations
# decimal is part of Python standard library
sympy>=1.12.0
sympy>=1.14.0
# Tax calculations
numpy>=2.3.3
pandas>=2.1.0
pandas>=2.3.3
# Date and time calculations
python-dateutil>=2.8.0
pytz>=2023.3
python-dateutil>=2.9.0
pytz>=2025.2
# UK tax specific
# uk-tax-calculator>=1.0.0 # Package may not exist, commenting out
@@ -26,10 +26,10 @@ pytz>=2023.3
# quantlib>=1.32.0 # Package may not exist, commenting out
# Data validation
cerberus>=1.3.4
cerberus>=1.3.7
# Template processing for explanations
jinja2>=3.1.0
jinja2>=3.1.6
# Statistical calculations
scipy>=1.11.0
scipy>=1.16.2

View File

@@ -1,11 +1,11 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0
# Service-specific dependencies
# Browser automation
playwright>=1.40.0
playwright>=1.55.0
# Additional async utilities
# asyncio-timeout>=4.0.3 # Deprecated, use asyncio.timeout from Python 3.11+ standard library
@@ -14,4 +14,4 @@ playwright>=1.40.0
aioredis>=2.0.1
# Browser management
psutil>=5.9.0
psutil>=7.1.0

View File

@@ -7,6 +7,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
## Current State Analysis
### Remote Server (`141.136.35.199`)
- **Location**: `/opt/compose/`
- **Existing Services**:
- Traefik v3.5.1 (reverse proxy with GoDaddy DNS challenge)
@@ -25,6 +26,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
- `portainer.harkon.co.uk`
### Local Repository (`infra/compose/`)
- **Compose Files**:
- `docker-compose.local.yml` - Full stack for local development
- `docker-compose.backend.yml` - Backend services (appears to be production-ready)
@@ -39,25 +41,30 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
## Challenges & Conflicts
### 1. **Duplicate Services**
- Both environments have Traefik and Authentik
- Need to decide: shared vs. isolated
### 2. **Network Naming**
- Remote: `frontend`, `backend`
- Local: `ai-tax-agent-frontend`, `ai-tax-agent-backend`
- Production needs: Consistent naming
### 3. **Domain Management**
- Remote: `*.harkon.co.uk` (public)
- Local: `*.local.lan` (development)
- Production: Need subdomains like `app.harkon.co.uk`, `api.harkon.co.uk`
### 4. **SSL Certificates**
- Remote: GoDaddy DNS challenge (production)
- Local: Self-signed certificates
- Production: Must use GoDaddy DNS challenge
### 5. **Resource Isolation**
- Company services need to remain stable
- Application services need independent deployment/rollback
@@ -66,6 +73,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an
We will deploy the company services and the AI Tax Agent as two fully isolated stacks, each with its own Traefik and Authentik. This maximizes blast-radius isolation and avoids naming and DNS conflicts across environments.
Key implications:
- Separate external networks and DNS namespaces per stack
- Duplicate edge (Traefik) and IdP (Authentik), independent upgrades and rollbacks
- Slightly higher resource usage in exchange for strong isolation
@@ -139,6 +147,7 @@ Key implications:
### Domain Mapping
**Company Services** (existing):
- `traefik.harkon.co.uk` - Traefik dashboard
- `auth.harkon.co.uk` - Authentik SSO
- `gitea.harkon.co.uk` - Git hosting
@@ -146,6 +155,7 @@ Key implications:
- `portainer.harkon.co.uk` - Docker management
**Application Services** (app stack):
- `review.<domain>` - Review UI
- `api.<domain>` - API Gateway (microservices via Traefik)
- `vault.<domain>` - Vault UI (admin only)
@@ -159,12 +169,14 @@ Key implications:
### Authentication Strategy
**Authentik Configuration**:
1. **Company Group** - Access to Gitea, Nextcloud, Portainer
2. **App Admin Group** - Full access to all app services
3. **App User Group** - Access to Review UI and API
4. **App Reviewer Group** - Access to Review UI only
**Middleware Configuration**:
- `authentik-forwardauth` - Standard auth for all services
- `admin-auth` - Requires admin group (Vault, MinIO, Neo4j, etc.)
- `reviewer-auth` - Requires reviewer or higher
@@ -182,6 +194,7 @@ Key implications:
### Development Environment
**Keep Existing Setup**:
- Use `docker-compose.local.yml` as-is
- Domain: `*.local.lan`
- Self-signed certificates
@@ -189,6 +202,7 @@ Key implications:
- Full stack runs locally
**Benefits**:
- No dependency on remote server
- Fast iteration
- Complete isolation
@@ -217,19 +231,22 @@ make deploy-production # Deploy to remote server
### Phase 1: Preparation (Week 1)
1. **Backup Current State**
```bash
ssh deploy@141.136.35.199
cd /opt/compose
cd /opt
tar -czf ~/backup-$(date +%Y%m%d).tar.gz .
```
2. **Create Production Environment File**
- Copy `infra/compose/env.example` to `infra/compose/.env.production`
- Copy `infra/environments/production/.env.example` to `infra/environments/production/.env`
- Update all secrets and passwords
- Set `DOMAIN=harkon.co.uk`
- Configure GoDaddy API credentials
3. **Update Traefik Configuration**
- Merge local Traefik config with remote
- Add application routes
- Configure Authentik ForwardAuth
@@ -242,13 +259,15 @@ make deploy-production # Deploy to remote server
### Phase 2: Infrastructure Deployment (Week 2)
1. **Deploy Application Infrastructure**
```bash
# On remote server
cd /opt/compose/ai-tax-agent
cd /opt/ai-tax-agent
docker compose -f infrastructure.yaml up -d
```
2. **Initialize Services**
- Vault: Unseal and configure
- Postgres: Run migrations
- Neo4j: Install plugins
@@ -262,11 +281,13 @@ make deploy-production # Deploy to remote server
### Phase 3: Application Deployment (Week 3)
1. **Deploy Microservices**
```bash
docker compose -f services.yaml up -d
```
2. **Deploy Monitoring**
```bash
docker compose -f monitoring.yaml up -d
```

View File

@@ -10,7 +10,7 @@
### 1. Production Compose Files Created
Created three production-ready Docker Compose files in `infra/compose/production/`:
Created three production-ready Docker Compose files in `infra/base/`:
#### **infrastructure.yaml**
- Vault (secrets management)
@@ -104,7 +104,7 @@ chmod +x scripts/deploy-to-production.sh
### 3. Documentation Created
#### **infra/compose/production/README.md**
#### **infra/base manifests**
Comprehensive production deployment guide including:
- Prerequisites checklist
- Three deployment options (automated, step-by-step, manual)
@@ -221,7 +221,7 @@ Or step-by-step:
1. **Initialize Vault**
```bash
ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent
cd /opt/ai-tax-agent
docker exec -it vault vault operator init
# Save unseal keys!
docker exec -it vault vault operator unseal
@@ -382,7 +382,6 @@ Deployment is successful when:
If you encounter issues:
1. Check logs: `./scripts/deploy-to-production.sh logs <service>`
2. Verify status: `./scripts/deploy-to-production.sh verify`
3. Review documentation: `infra/compose/production/README.md`
3. Review manifests: `infra/base/*.yaml`
4. Check deployment plan: `docs/DEPLOYMENT_PLAN.md`
5. Follow checklist: `docs/DEPLOYMENT_CHECKLIST.md`

View File

@@ -21,15 +21,14 @@
- ✅ Created quick start guide (`docs/QUICK_START.md`)
### 3. Production Configuration Files
- ✅ Created `infra/compose/production/infrastructure.yaml` (7 infrastructure services)
- ✅ Created `infra/compose/production/services.yaml` (14 application services + UI)
- ✅ Created `infra/compose/production/monitoring.yaml` (Prometheus, Grafana, Loki, Promtail)
- ✅ Created `infra/compose/production/README.md` (deployment guide)
- ✅ Created `infra/base/infrastructure.yaml` (infrastructure, incl. Traefik + Authentik)
- ✅ Created `infra/base/services.yaml` (application services + UI)
- ✅ Created `infra/base/monitoring.yaml` (Prometheus, Grafana, Loki, Promtail)
### 4. Monitoring Configuration
- ✅ Created Prometheus configuration (`infra/compose/prometheus/prometheus.yml`)
- ✅ Created Loki configuration (`infra/compose/loki/loki-config.yml`)
- ✅ Created Promtail configuration (`infra/compose/promtail/promtail-config.yml`)
- ✅ Created Prometheus configuration (`infra/base/prometheus/prometheus.yml`)
- ✅ Created Loki configuration (`infra/base/loki/loki-config.yml`)
- ✅ Created Promtail configuration (`infra/base/promtail/promtail-config.yml`)
- ✅ Configured service discovery for all 14 services
- ✅ Set up 30-day metrics retention
@@ -266,10 +265,9 @@ df -h
- `docs/ENVIRONMENT_COMPARISON.md` - Local vs Production comparison
2. **Configuration:**
- `infra/compose/production/README.md` - Production compose guide
- `infra/compose/production/infrastructure.yaml` - Infrastructure services
- `infra/compose/production/services.yaml` - Application services
- `infra/compose/production/monitoring.yaml` - Monitoring stack
- `infra/base/infrastructure.yaml` - Infrastructure services
- `infra/base/services.yaml` - Application services
- `infra/base/monitoring.yaml` - Monitoring stack
3. **Deployment:**
- `docs/POST_BUILD_DEPLOYMENT.md` - Post-build deployment steps
@@ -319,4 +317,3 @@ For questions or issues:
- 🟡 In Progress
- ⏳ Pending
- ❌ Blocked

View File

@@ -12,7 +12,7 @@ This document compares the local development environment with the production env
| **SSL** | Self-signed certificates | Let's Encrypt (GoDaddy DNS) |
| **Networks** | `ai-tax-agent-frontend`<br/>`ai-tax-agent-backend` | `frontend`<br/>`backend` |
| **Compose File** | `docker-compose.local.yml` | `infrastructure.yaml`<br/>`services.yaml`<br/>`monitoring.yaml` |
| **Location** | Local machine | `deploy@141.136.35.199:/opt/compose/ai-tax-agent/` |
| **Location** | Local machine | `deploy@141.136.35.199:/opt/ai-tax-agent/` |
| **Traefik** | Isolated instance | Shared with company services |
| **Authentik** | Isolated instance | Shared with company services |
| **Data Persistence** | Local Docker volumes | Remote Docker volumes + backups |
@@ -271,7 +271,7 @@ make clean
#### Production
```bash
# Deploy infrastructure
cd /opt/compose/ai-tax-agent
cd /opt/ai-tax-agent
docker compose -f infrastructure.yaml up -d
# Deploy services
@@ -370,7 +370,7 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion
4. **Deploy to production**:
```bash
ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent
cd /opt/ai-tax-agent
docker compose -f services.yaml pull
docker compose -f services.yaml up -d
```
@@ -436,4 +436,3 @@ The key differences between local and production environments are:
6. **Backups**: Local has none; production has automated backups
Both environments use the same application code and Docker images, ensuring consistency and reducing deployment risks.

View File

@@ -1,332 +0,0 @@
# Gitea Container Registry Debugging Guide
## Common Issues When Pushing Large Docker Images
### Issue 1: Not Logged In
**Symptom**: `unauthorized: authentication required`
**Solution**:
```bash
# On remote server
docker login gitea.harkon.co.uk
# Username: blue (or your Gitea username)
# Password: <your-gitea-access-token>
```
---
### Issue 2: Upload Size Limit (413 Request Entity Too Large)
**Symptom**: Push fails with `413 Request Entity Too Large` or similar error
**Root Cause**: Traefik or Gitea has a limit on request body size
**Solution A: Configure Traefik Middleware**
1. Find your Traefik configuration directory:
```bash
docker inspect traefik | grep -A 10 Mounts
```
2. Create middleware configuration:
```bash
# Example: /opt/traefik/config/middlewares.yml
sudo tee /opt/traefik/config/middlewares.yml > /dev/null << 'EOF'
http:
middlewares:
large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
EOF
```
3. Update Gitea container labels:
```yaml
labels:
- "traefik.http.routers.gitea.middlewares=large-upload@file"
```
4. Restart Traefik:
```bash
docker restart traefik
```
**Solution B: Configure Gitea Directly**
1. Edit Gitea configuration:
```bash
docker exec -it gitea-server vi /data/gitea/conf/app.ini
```
2. Add/modify these settings:
```ini
[server]
LFS_MAX_FILE_SIZE = 5368709120 ; 5GB
[repository.upload]
FILE_MAX_SIZE = 5368709120 ; 5GB
```
3. Restart Gitea:
```bash
docker restart gitea-server
```
---
### Issue 3: Network Timeout
**Symptom**: Push hangs or times out after uploading for a while
**Root Cause**: Network instability or slow connection
**Solution**: Use chunked uploads or increase timeout
1. Configure Docker daemon timeout:
```bash
# Edit /etc/docker/daemon.json
sudo tee /etc/docker/daemon.json > /dev/null << 'EOF'
{
"max-concurrent-uploads": 1,
"max-concurrent-downloads": 3,
"registry-mirrors": []
}
EOF
sudo systemctl restart docker
```
2. Or use Traefik timeout middleware:
```yaml
http:
middlewares:
long-timeout:
buffering:
retryExpression: "IsNetworkError() && Attempts() < 3"
```
---
### Issue 4: Disk Space
**Symptom**: Push fails with "no space left on device"
**Solution**:
```bash
# Check disk space
df -h
# Clean up Docker
docker system prune -a --volumes -f
# Check again
df -h
```
---
### Issue 5: Gitea Registry Not Enabled
**Symptom**: `404 Not Found` when accessing `/v2/`
**Solution**:
```bash
# Check if registry is enabled
docker exec gitea-server cat /data/gitea/conf/app.ini | grep -A 5 "\[packages\]"
# Should show:
# [packages]
# ENABLED = true
```
If not enabled, add to `app.ini`:
```ini
[packages]
ENABLED = true
```
Restart Gitea:
```bash
docker restart gitea-server
```
---
## Debugging Steps
### Step 1: Verify Gitea Registry is Accessible
```bash
# Should return 401 Unauthorized (which is good - means registry is working)
curl -I https://gitea.harkon.co.uk/v2/
# Should return 200 OK after login
docker login gitea.harkon.co.uk
curl -u "username:token" https://gitea.harkon.co.uk/v2/
```
### Step 2: Test with Small Image
```bash
# Pull a small image
docker pull alpine:latest
# Tag it for your registry
docker tag alpine:latest gitea.harkon.co.uk/harkon/test:latest
# Try to push
docker push gitea.harkon.co.uk/harkon/test:latest
```
If this works, the issue is with large images (size limit).
### Step 3: Check Gitea Logs
```bash
# Check for errors
docker logs gitea-server --tail 100 | grep -i error
# Watch logs in real-time while pushing
docker logs -f gitea-server
```
### Step 4: Check Traefik Logs
```bash
# Check for 413 or 502 errors
docker logs traefik --tail 100 | grep -E "413|502|error"
# Watch logs in real-time
docker logs -f traefik
```
### Step 5: Check Docker Daemon Logs
```bash
# Check Docker daemon logs
sudo journalctl -u docker --since "1 hour ago" | grep -i error
```
---
## Quick Fix: Bypass Traefik for Registry
If Traefik is causing issues, you can expose Gitea's registry directly:
1. Update Gitea docker-compose to expose port 3000:
```yaml
services:
gitea:
ports:
- "3000:3000" # HTTP
```
2. Use direct connection:
```bash
docker login gitea.harkon.co.uk:3000
docker push gitea.harkon.co.uk:3000/harkon/base-ml:v1.0.1
```
**Note**: This bypasses SSL, so only use for debugging!
---
## Recommended Configuration for Large Images
### Traefik Configuration
Create `/opt/traefik/config/gitea-registry.yml`:
```yaml
http:
middlewares:
gitea-registry:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB in memory
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB in memory
routers:
gitea-registry:
rule: "Host(`gitea.harkon.co.uk`) && PathPrefix(`/v2/`)"
entryPoints:
- websecure
middlewares:
- gitea-registry
service: gitea
tls:
certResolver: letsencrypt
```
### Gitea Configuration
In `/data/gitea/conf/app.ini`:
```ini
[server]
PROTOCOL = http
DOMAIN = gitea.harkon.co.uk
ROOT_URL = https://gitea.harkon.co.uk/
HTTP_PORT = 3000
LFS_MAX_FILE_SIZE = 5368709120
[repository.upload]
FILE_MAX_SIZE = 5368709120
ENABLED = true
[packages]
ENABLED = true
CHUNKED_UPLOAD_PATH = /data/gitea/tmp/package-upload
```
---
## Testing the Fix
After applying configuration changes:
1. Restart services:
```bash
docker restart traefik
docker restart gitea-server
```
2. Test with a large layer:
```bash
# Build base-ml (has large layers)
cd /home/deploy/ai-tax-agent
docker build -f infra/docker/base-ml.Dockerfile -t gitea.harkon.co.uk/harkon/base-ml:test .
# Try to push
docker push gitea.harkon.co.uk/harkon/base-ml:test
```
3. Monitor logs:
```bash
# Terminal 1: Watch Traefik
docker logs -f traefik
# Terminal 2: Watch Gitea
docker logs -f gitea-server
# Terminal 3: Push image
docker push gitea.harkon.co.uk/harkon/base-ml:test
```
---
## Alternative: Use Docker Hub or GitHub Container Registry
If Gitea continues to have issues with large images, consider:
1. **Docker Hub**: Free for public images
2. **GitHub Container Registry (ghcr.io)**: Free for public/private
3. **GitLab Container Registry**: Free tier available
These are battle-tested for large ML images and have better defaults for large uploads.

View File

@@ -1,194 +0,0 @@
# Gitea Container Registry - Image Naming Fix
## Issue
The initial build script used an incorrect image naming convention for Gitea's container registry.
### Incorrect Format
```
gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:v1.0.0
```
### Correct Format (Per Gitea Documentation)
```
gitea.harkon.co.uk/{owner}/{image}:{tag}
```
Where `{owner}` must be your **Gitea username** or **organization name**.
**Using organization:** `harkon` (Gitea team/organization)
## Solution
Updated the build script and production compose files to use the correct naming convention.
### Changes Made
#### 1. Build Script (`scripts/build-and-push-images.sh`)
**Before:**
```bash
REGISTRY="${1:-gitea.harkon.co.uk}"
VERSION="${2:-latest}"
PROJECT="ai-tax-agent"
IMAGE_NAME="$REGISTRY/$PROJECT/$service:$VERSION"
```
**After:**
```bash
REGISTRY="${1:-gitea.harkon.co.uk}"
VERSION="${2:-latest}"
OWNER="${3:-harkon}" # Gitea organization/team name
IMAGE_NAME="$REGISTRY/$OWNER/$service:$VERSION"
```
#### 2. Production Services (`infra/compose/production/services.yaml`)
**Before:**
```yaml
svc-ingestion:
image: gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:latest
```
**After:**
```yaml
svc-ingestion:
image: gitea.harkon.co.uk/harkon/svc-ingestion:latest
```
All 14 services updated:
- svc-ingestion
- svc-extract
- svc-kg
- svc-rag-retriever
- svc-rag-indexer
- svc-forms
- svc-hmrc
- svc-ocr
- svc-rpa
- svc-normalize-map
- svc-reason
- svc-firm-connectors
- svc-coverage
- ui-review
## Usage
### Build and Push Images
```bash
# With default owner (harkon organization)
./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1
# With custom owner
./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1 <your-gitea-org>
```
### Pull Images
```bash
docker pull gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
```
### Push Images Manually
```bash
# Tag image
docker tag my-image:latest gitea.harkon.co.uk/harkon/my-image:v1.0.1
# Push image
docker push gitea.harkon.co.uk/harkon/my-image:v1.0.1
```
## Gitea Registry Documentation Reference
From Gitea's official documentation:
### Image Naming Convention
Images must follow this naming convention:
```
{registry}/{owner}/{image}
```
When building your Docker image using the naming convention above, this looks like:
```bash
# build an image with tag
docker build -t {registry}/{owner}/{image}:{tag} .
# name an existing image with tag
docker tag {some-existing-image}:{tag} {registry}/{owner}/{image}:{tag}
```
### Valid Examples
For owner `testuser` on `gitea.example.com`:
- `gitea.example.com/testuser/myimage`
- `gitea.example.com/testuser/my-image`
- `gitea.example.com/testuser/my/image`
### Important Notes
1. **Owner must exist**: The owner (username or organization) must exist in Gitea
2. **Case-insensitive tags**: `image:tag` and `image:Tag` are treated as the same
3. **Authentication required**: Use personal access token with `write:package` scope
4. **Registry URL**: Use the main Gitea domain, not a separate registry subdomain
## Verification
After the fix, verify images are pushed correctly:
```bash
# Login to Gitea
docker login gitea.harkon.co.uk
# Check pushed images in Gitea UI
# Navigate to: https://gitea.harkon.co.uk/blue/-/packages
```
## Current Build Status
**Fixed and working!**
Build command:
```bash
./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1 harkon
```
Expected output:
```
Logging in to registry: gitea.harkon.co.uk
Login Succeeded
Building svc-ingestion...
Building: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
✅ Built: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
Pushing: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
✅ Pushed: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1
```
## Next Steps
1. ✅ Build script fixed
2. ✅ Production compose files updated
3. 🟡 Build in progress (14 services)
4. ⏳ Deploy to production (after build completes)
## References
- [Gitea Container Registry Documentation](https://docs.gitea.com/usage/packages/container)
- Build script: `scripts/build-and-push-images.sh`
- Production services: `infra/compose/production/services.yaml`

View File

@@ -148,11 +148,11 @@ docker run --rm gitea.harkon.co.uk/harkon/svc-ocr:v1.0.1 pip list | grep torch
### 5. Update Production Deployment
Update `infra/compose/production/services.yaml` to use `v1.0.1`:
Update `infra/base/services.yaml` to use `v1.0.1`:
```bash
# Find and replace v1.0.0 with v1.0.1
sed -i '' 's/:v1.0.0/:v1.0.1/g' infra/compose/production/services.yaml
sed -i '' 's/:v1.0.0/:v1.0.1/g' infra/base/services.yaml
# Or use latest tag (already configured)
# No changes needed if using :latest

View File

@@ -50,7 +50,7 @@ docker login gitea.harkon.co.uk
**SSH to server:**
```bash
ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent
cd /opt/ai-tax-agent
```
**Initialize Vault:**
@@ -62,19 +62,19 @@ docker exec -it vault vault operator unseal
**Create MinIO Buckets:**
```bash
docker exec -it minio mc alias set local http://localhost:9092 admin <MINIO_PASSWORD>
docker exec -it minio mc mb local/documents
docker exec -it minio mc mb local/models
docker exec -it apa-minio mc alias set local http://localhost:9000 admin <MINIO_PASSWORD>
docker exec -it apa-minio mc mb local/documents
docker exec -it apa-minio mc mb local/models
```
**Create NATS Streams:**
```bash
docker exec -it nats nats stream add TAX_AGENT_EVENTS \
docker exec -it apa-nats nats stream add TAX_AGENT_EVENTS \
--subjects="tax.>" --storage=file --retention=limits --max-age=7d
```
**Configure Authentik:**
1. Go to https://authentik.harkon.co.uk
1. Go to https://auth.harkon.co.uk
2. Create groups: `app-admin`, `app-user`, `app-reviewer`
3. Create OAuth providers for:
- Review UI: `app.harkon.co.uk`
@@ -94,7 +94,7 @@ curl -I https://api.harkon.co.uk/healthz
curl -I https://grafana.harkon.co.uk
# View logs
./scripts/deploy-to-production.sh logs svc-ingestion
./scripts/deploy-to-production.sh logs apa-svc-ingestion
```
---
@@ -127,8 +127,8 @@ curl -I https://grafana.harkon.co.uk
### Restart Service
```bash
ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent
docker compose -f services.yaml restart svc-ingestion
cd /opt/ai-tax-agent
docker compose -f services.yaml restart apa-svc-ingestion
```
### Check Status
@@ -163,25 +163,25 @@ docker compose -f services.yaml logs svc-ingestion
docker compose -f infrastructure.yaml ps
# Restart
docker compose -f services.yaml restart svc-ingestion
docker compose -f services.yaml restart apa-svc-ingestion
```
### SSL Issues
```bash
# Check Traefik logs
docker logs traefik
docker logs apa-traefik
# Check certificates
sudo cat /opt/compose/traefik/certs/godaddy-acme.json | jq
sudo cat /opt/ai-tax-agent/traefik/certs/godaddy-acme.json | jq
```
### Database Connection
```bash
# Test Postgres
docker exec -it postgres pg_isready -U postgres
docker exec -it apa-postgres pg_isready -U postgres
# Check env vars
docker exec -it svc-ingestion env | grep POSTGRES
docker exec -it apa-svc-ingestion env | grep POSTGRES
```
---
@@ -190,7 +190,7 @@ docker exec -it svc-ingestion env | grep POSTGRES
```bash
ssh deploy@141.136.35.199
cd /opt/compose/ai-tax-agent
cd /opt/ai-tax-agent
# Stop services
docker compose -f services.yaml down
@@ -198,12 +198,11 @@ docker compose -f infrastructure.yaml down
docker compose -f monitoring.yaml down
# Restore backup
cd /opt/compose
cd /opt
tar -xzf ~/backups/backup-YYYYMMDD-HHMMSS.tar.gz
# Restart company services
cd /opt/compose/traefik && docker compose up -d
cd /opt/compose/authentik && docker compose up -d
# Restart application infra
cd /opt/ai-tax-agent && docker compose -f infrastructure.yaml up -d
```
---
@@ -242,4 +241,3 @@ cd /opt/compose/authentik && docker compose up -d
```bash
./scripts/deploy-to-production.sh logs <service>
```

View File

@@ -16,3 +16,49 @@ http:
- X-authentik-meta-provider
- X-authentik-meta-app
- X-authentik-meta-version
# Large upload middleware for Gitea registry
gitea-large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
retryExpression: "IsNetworkError() && Attempts() < 3"
# Rate limiting for public APIs
api-ratelimit:
rateLimit:
average: 100
burst: 50
period: 1s
# Security headers
security-headers:
headers:
frameDeny: true
sslRedirect: true
browserXssFilter: true
contentTypeNosniff: true
stsIncludeSubdomains: true
stsPreload: true
stsSeconds: 31536000
# CORS headers
api-cors:
headers:
accessControlAllowMethods:
- GET
- POST
- PUT
- DELETE
- OPTIONS
accessControlAllowOriginList:
- "https://app.harkon.co.uk"
accessControlAllowHeaders:
- "Content-Type"
- "Authorization"
accessControlMaxAge: 100
addVaryHeader: true
# Security headers

View File

@@ -4,7 +4,9 @@ entryPoints:
address: ":80"
websecure:
address: ":443"
transport:
respondingTimeouts:
readTimeout: 30m
api:
dashboard: true

View File

@@ -1,31 +0,0 @@
# Application-specific Traefik middlewares
# These are loaded by the application infrastructure, not the external Traefik
http:
middlewares:
# Large upload middleware for Gitea registry
gitea-large-upload:
buffering:
maxRequestBodyBytes: 5368709120 # 5GB
memRequestBodyBytes: 104857600 # 100MB
maxResponseBodyBytes: 5368709120 # 5GB
memResponseBodyBytes: 104857600 # 100MB
retryExpression: "IsNetworkError() && Attempts() < 3"
# Rate limiting for public APIs
api-ratelimit:
rateLimit:
average: 100
burst: 50
period: 1s
# Security headers
security-headers:
headers:
frameDeny: true
sslRedirect: true
browserXssFilter: true
contentTypeNosniff: true
stsIncludeSubdomains: true
stsPreload: true
stsSeconds: 31536000

View File

@@ -1,25 +0,0 @@
-----BEGIN CERTIFICATE-----
MIIEHjCCAwagAwIBAgIUbOm5g4Xhb08Lk6DIpVst7+xZHOswDQYJKoZIhvcNAQEL
BQAwEDEOMAwGA1UEAwwFbG9jYWwwHhcNMjUwOTI4MTExNTM1WhcNMzUwOTI2MTEx
NTM1WjAQMQ4wDAYDVQQDDAVsb2NhbDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC
AQoCggEBAK0370DEo3dScS8uLwBsXkuaAHn9wO2fjxEHLZwHWfFo/16t+EEAi5c3
zDs7nYQ7LPLndxBfO6xZ5uWKNIVtp6ARzAeRbGgbjXDdK3fOyRdhhKR3aZVOH1D0
xUjEm/X5jEDv81sufSjk+DIQmh8hQnp3RwdHyhkIZUCTsBXMfnj+zs1UKTdRQBF5
SUplGsbh6z3xCSI4jiNRb7mNHXqV3Fv6ycwF8YdthSDfueltBP4vT/CDtebkkKPF
dx7YWEIPPUNqEoHqeI5iYP6gnWJYcr3vU+p2BuTwUICo+njzAf+P/SsjPHbujJob
dbHUclBHIrIO4BpYZtY1a7E219MbqcECAwEAAaOCAW4wggFqMB0GA1UdDgQWBBQ7
qHpza0Bb1xI1g7cMBx33JnFQljAfBgNVHSMEGDAWgBQ7qHpza0Bb1xI1g7cMBx33
JnFQljAPBgNVHRMBAf8EBTADAQH/MIIBFQYDVR0RBIIBDDCCAQiCCWxvY2FsaG9z
dIcEfwAAAYILKi5sb2NhbC5sYW6CDmF1dGgubG9jYWwubGFughFncmFmYW5hLmxv
Y2FsLmxhboIQcmV2aWV3LmxvY2FsLmxhboINYXBpLmxvY2FsLmxhboIPdmF1bHQu
bG9jYWwubGFugg9taW5pby5sb2NhbC5sYW6CE21pbmlvLWFwaS5sb2NhbC5sYW6C
EHFkcmFudC5sb2NhbC5sYW6CD25lbzRqLmxvY2FsLmxhboIUcHJvbWV0aGV1cy5s
b2NhbC5sYW6CDmxva2kubG9jYWwubGFughF1bmxlYXNoLmxvY2FsLmxhboIRdHJh
ZWZpay5sb2NhbC5sYW4wDQYJKoZIhvcNAQELBQADggEBAICf+2MZ7BHbSD/pnvll
G7Zmk+Bntj2F6RBQVZ2ZsKPWkHeZEYJDRvU0I2uL5tvvDJp4q0hjdluJllchhGgr
qfu7i+kRnhzme7oyRTFGYp8b3zHBvLyJLmdIALxuNSjIEeh1Fx0lEhKwqOlA4y6T
jziPmsGv3IonGJM2dURGNcR7DfG6H/Yl12qV8u/tVFTxqWL+hyCE7u8v+ZIcZ+fj
82X7hXt1HvfP84EhVtfqQMb5xykLtXvPqggSCFXYIj2PanWdwEdE6P5Yr2D1Yz7r
tzpmpoetrGoMWIeB0yiWgt0qJ/KK7meoCp64mqfBc48p1p/7kj2R/FRH1Jx3gFWy
dT4=
-----END CERTIFICATE-----

View File

@@ -1,28 +0,0 @@
-----BEGIN PRIVATE KEY-----
MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCtN+9AxKN3UnEv
Li8AbF5LmgB5/cDtn48RBy2cB1nxaP9erfhBAIuXN8w7O52EOyzy53cQXzusWebl
ijSFbaegEcwHkWxoG41w3St3zskXYYSkd2mVTh9Q9MVIxJv1+YxA7/NbLn0o5Pgy
EJofIUJ6d0cHR8oZCGVAk7AVzH54/s7NVCk3UUAReUlKZRrG4es98QkiOI4jUW+5
jR16ldxb+snMBfGHbYUg37npbQT+L0/wg7Xm5JCjxXce2FhCDz1DahKB6niOYmD+
oJ1iWHK971Pqdgbk8FCAqPp48wH/j/0rIzx27oyaG3Wx1HJQRyKyDuAaWGbWNWux
NtfTG6nBAgMBAAECggEAHvtkNcd2HX+HcxLloUPA0fDnqOo0OcxSQI9yHvhJpB5N
nterEaVRUmjOhMGy+NXEwmWYLDt8ZuVloSTJJBxq4PyN68SdCTn0YH2Oqs03tpDg
srIRFn10qHw/VTalVqed6HeCpYp5JHlf00SY7Hx8cX8oGytCAJw50AUad6ut62IM
sp/QFdtkLhtq9vGzQUqyIP92Y/+GbxhB+eHkuvvFau1KJq7K8qhroFTwQFts9er2
890Ujmz3bF2RhHixQcpXpsf/DMyylGJTbZDmSFkTDa/c1PzqvKrmL3wP7A3bk1E5
CP8/a65ykotJEX8RkWqH2XxvRKpdWtCaeuCsmWUQ4QKBgQDTLbC9DWHCUYMWJhyW
TKAeXx5xFGHIqggN28lIkXFiCVsTZyOuRDN7Q/CbOat/0JthrzyP18L+6ewZt2ZN
RjdfGdnpUCJx6LR4dtBH8Rc+CjlSnqEgJIkgfIs8b9uEhMI1eQV+BAFQON3BzdpT
wQ86aGsrdqtpfav7cImVfGcY/QKBgQDR+7OcnEwh8s/1J2niMKjk8agyCGGHWW4M
g+vIv7lptavgEGOPMBv7QgmeuUjwSszphQXL36m39ZRmI5B+J0/onuQzv04tJeZY
WZhA+T12a+1VnvUZNZm/qp0I2rW+4m+DmJoLQlvpaaFit/1fPJ6+IzI2VzPeWhw2
vUQ5QIYhFQKBgFUWZc3mpGsNOMol1QLiIOnb3YImejfF+rTKx9FLeOnNZzrsJb5D
kJKsDzgcBnPbc5/qYXZ7sv/O9OhvsvKTxh+1ZM3TEe3fm0emZ8l05K6EpBAcBkPT
NMU4KUnSsBo2+6Fb/9CEgJr4LrG15bA1a5NXG0dJ60r37eHDuEvY8hlpAoGADWv2
PhNrdlwL2NKtHO0ZTpD3vEL24OzhcOFZx9ohYtVe6BKEGpnrn/LHpKKZO+q8EE0V
YsOoGH8U/jZVvQqMPAUz9u7Kc25Ru+H2Lmj/+brKT8e6SOM5MZwZL4CzT0Ev+Yxe
hEu4jkHXM/Uot9arGuIrCngmc5b06LbOTo6GREUCgYArWyPYeETah/GVwU7/TNY5
5f8lNbWBoXZfpVbWdoUZT6tGWciZsiXSR4x9f+1/LMIuChegSEazrJUDt7TbCkZs
s4A66pnME37aYP2sMvJF3zSnQWVIyBgGI5xX0XW/WdozKl1mdFfigyWp58uo2dS2
TxE3dy8rxpUdDCUmvJT/Fw==
-----END PRIVATE KEY-----

View File

@@ -134,7 +134,7 @@ class Neo4jClient:
result = await self.run_query(query, {"properties": properties}, database)
node = result[0]["n"] if result else {}
# Return node ID if available, otherwise return the full node
return node.get("id", node)
return node.get("id", node) # type: ignore
async def update_node(
self,
@@ -209,7 +209,7 @@ class Neo4jClient:
database,
)
rel = result[0]["r"] if result else {}
return rel.get("id", rel)
return rel.get("id", rel) # type: ignore
# Original signature (using labels and IDs)
rel_properties = properties or {}
@@ -231,7 +231,7 @@ class Neo4jClient:
)
rel = result[0]["r"] if result else {}
# Return relationship ID if available, otherwise return the full relationship
return rel.get("id", rel)
return rel.get("id", rel) # type: ignore
async def get_node_lineage(
self, node_id: str, max_depth: int = 10, database: str = "neo4j"

libs/ocr/processor.py (new file, 507 lines)
View File

@@ -0,0 +1,507 @@
import base64
import concurrent.futures
import io
import json
import os
from pathlib import Path
from typing import Any
import numpy as np
import requests
from PIL import Image, ImageFilter
from PyPDF2 import PdfReader
class OCRProcessor:
def __init__(
self,
model_name: str = "llama3.2-vision:11b",
base_url: str = "http://localhost:11434/api/generate",
max_workers: int = 1,
provider: str = "ollama",
openai_api_key: str | None = None,
openai_base_url: str = "https://api.openai.com/v1/chat/completions",
):
self.model_name = model_name
self.base_url = base_url
self.max_workers = max_workers
self.provider = provider.lower()
self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
self.openai_base_url = openai_base_url
def _encode_image(self, image_path: str) -> str:
"""Convert image to base64 string"""
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
def _pdf_to_images(self, pdf_path: str) -> list[str]:
"""
Convert each page of a PDF to an image without PyMuPDF.
Strategy: extract largest embedded image per page via PyPDF2.
Saves each selected image as a temporary PNG and returns paths.
Note: Text-only pages with no embedded images will be skipped here.
Use _pdf_extract_text as a fallback for such pages.
"""
image_paths: list[str] = []
try:
reader = PdfReader(pdf_path)
for page_index, page in enumerate(reader.pages):
try:
resources = page.get("/Resources")
if resources is None:
continue
xobject = resources.get("/XObject")
if xobject is None:
continue
xobject = xobject.get_object()
largest = None
largest_area = -1
for _, obj_ref in xobject.items():
try:
obj = obj_ref.get_object()
if obj.get("/Subtype") != "/Image":
continue
width = int(obj.get("/Width", 0))
height = int(obj.get("/Height", 0))
area = width * height
if area > largest_area:
largest = obj
largest_area = area
except Exception:
continue
if largest is None:
continue
data = largest.get_data()
filt = largest.get("/Filter")
out_path = f"{pdf_path}_page{page_index}.png"
# If JPEG/JPX, write bytes directly; else convert via PIL
if filt in ("/DCTDecode",):
# JPEG
out_path = f"{pdf_path}_page{page_index}.jpg"
with open(out_path, "wb") as f:
f.write(data)
elif filt in ("/JPXDecode",):
out_path = f"{pdf_path}_page{page_index}.jp2"
with open(out_path, "wb") as f:
f.write(data)
else:
mode = "RGB"
colorspace = largest.get("/ColorSpace")
if colorspace in ("/DeviceGray",):
mode = "L"
width = int(largest.get("/Width", 0))
height = int(largest.get("/Height", 0))
try:
img = Image.frombytes(mode, (width, height), data)
except Exception:
# Best-effort decode via Pillow
img = Image.open(io.BytesIO(data))
img.save(out_path, format="PNG")
image_paths.append(out_path)
except Exception:
# Continue gracefully for problematic pages/objects
continue
return image_paths
except Exception as e:
raise ValueError(f"Could not extract images from PDF: {e}")
def _pdf_extract_text(self, pdf_path: str) -> list[str]:
"""Extract text per page using pdfplumber if available, else PyPDF2."""
texts: list[str] = []
try:
try:
import pdfplumber
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
texts.append(page.extract_text() or "")
return texts
except Exception:
# Fallback to PyPDF2
reader = PdfReader(pdf_path)
for page in reader.pages: # type: ignore
texts.append(page.extract_text() or "")
return texts
except Exception as e:
raise ValueError(f"Could not extract text from PDF: {e}")
def _call_ollama_vision(self, prompt: str, image_base64: str) -> str:
payload = {
"model": self.model_name,
"prompt": prompt,
"stream": False,
"images": [image_base64],
}
response = requests.post(self.base_url, json=payload)
response.raise_for_status()
return response.json().get("response", "") # type: ignore
def _call_openai_vision(self, prompt: str, image_base64: str) -> str:
if not self.openai_api_key:
raise ValueError("OPENAI_API_KEY not set")
# Compose chat.completions payload for GPT-4o/mini vision
payload = {
"model": self.model_name or "gpt-4o-mini",
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}",
},
},
],
}
],
"temperature": 0,
}
headers = {
"Authorization": f"Bearer {self.openai_api_key}",
"Content-Type": "application/json",
}
response = requests.post(self.openai_base_url, headers=headers, json=payload)
response.raise_for_status()
data = response.json()
try:
return data["choices"][0]["message"]["content"] # type: ignore
except Exception:
return json.dumps(data)
def _preprocess_image(self, image_path: str, language: str = "en") -> str:
"""
Preprocess image before OCR using Pillow + NumPy:
- Convert to grayscale
- Histogram equalization (contrast)
- Median denoise
- Otsu threshold and invert
"""
try:
with Image.open(image_path) as img:
if img.mode in ("RGBA", "LA"):
img = img.convert("RGB")
gray = img.convert("L")
# Histogram equalization via cumulative distribution
arr = np.asarray(gray)
hist, _ = np.histogram(arr.flatten(), 256, [0, 256]) # type: ignore
cdf = hist.cumsum()
cdf_masked = np.ma.masked_equal(cdf, 0) # type: ignore
cdf_min = cdf_masked.min() if cdf_masked.size else 0
cdf_max = cdf_masked.max() if cdf_masked.size else 0
if cdf_max == cdf_min:
eq = arr
else:
cdf_scaled = (cdf_masked - cdf_min) * 255 / (cdf_max - cdf_min)
lut = np.ma.filled(cdf_scaled, 0).astype("uint8")
eq = lut[arr]
eq_img = Image.fromarray(eq, mode="L")
# Median filter (3x3) to reduce noise
eq_img = eq_img.filter(ImageFilter.MedianFilter(size=3))
arr_eq = np.asarray(eq_img)
# Otsu threshold
hist2, _ = np.histogram(arr_eq, 256, [0, 256]) # type: ignore
total = arr_eq.size
sum_total = (np.arange(256) * hist2).sum()
sum_b = 0.0
w_b = 0.0
max_var = 0.0
thr = 0
for t in range(256):
w_b += hist2[t]
if w_b == 0:
continue
w_f = total - w_b
if w_f == 0:
break
sum_b += t * hist2[t]
m_b = sum_b / w_b
m_f = (sum_total - sum_b) / w_f
var_between = w_b * w_f * (m_b - m_f) ** 2
if var_between > max_var:
max_var = var_between
thr = t
binary = (arr_eq > thr).astype(np.uint8) * 255
# Invert: black text on white background
binary = 255 - binary
out_img = Image.fromarray(binary, mode="L")
preprocessed_path = f"{image_path}_preprocessed.jpg"
out_img.save(preprocessed_path, format="JPEG", quality=95)
return preprocessed_path
except Exception as e:
raise ValueError(f"Failed to preprocess image {image_path}: {e}")
def process_image(
self,
image_path: str,
format_type: str = "markdown",
preprocess: bool = True,
custom_prompt: str | None = None,
language: str = "en",
) -> str:
"""
Process an image (or PDF) and extract text in the specified format
Args:
image_path: Path to the image file or PDF file
format_type: One of ["markdown", "text", "json", "structured", "key_value","custom"]
preprocess: Whether to apply image preprocessing
custom_prompt: If provided, this prompt overrides the default based on format_type
language: Language code to apply language specific OCR preprocessing
"""
try:
# If the input is a PDF, process all pages
if image_path.lower().endswith(".pdf"):
image_pages = self._pdf_to_images(image_path)
responses: list[str] = []
if image_pages:
for idx, page_file in enumerate(image_pages):
# Process each page with preprocessing if enabled
if preprocess:
preprocessed_path = self._preprocess_image(
page_file, language
)
else:
preprocessed_path = page_file
image_base64 = self._encode_image(preprocessed_path)
if custom_prompt and custom_prompt.strip():
prompt = custom_prompt
else:
prompts = {
"markdown": f"""Extract all text content from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
Format the output in markdown:
- Use headers (#, ##, ###) **only if they appear in the image**
- Preserve original lists (-, *, numbered lists) as they are
- Maintain all text formatting (bold, italics, underlines) exactly as seen
- **Do not add, interpret, or restructure any content**
""",
"text": f"""Extract all visible text from this image in {language} **without any changes**.
- **Do not summarize, paraphrase, or infer missing text.**
- Retain all spacing, punctuation, and formatting exactly as in the image.
- If text is unclear or partially visible, extract as much as possible without guessing.
- **Include all text, even if it seems irrelevant or repeated.**
""",
"json": f"""Extract all text from this image in {language} and format it as JSON, **strictly preserving** the structure.
- **Do not summarize, add, or modify any text.**
- Maintain hierarchical sections and subsections as they appear.
- Use keys that reflect the document's actual structure (e.g., "title", "body", "footer").
- Include all text, even if fragmented, blurry, or unclear.
""",
"structured": f"""Extract all text from this image in {language}, **ensuring complete structural accuracy**:
- Identify and format tables **without altering content**.
- Preserve list structures (bulleted, numbered) **exactly as shown**.
- Maintain all section headings, indents, and alignments.
- **Do not add, infer, or restructure the content in any way.**
""",
"key_value": f"""Extract all key-value pairs from this image in {language} **exactly as they appear**:
- Identify and extract labels and their corresponding values without modification.
- Maintain the exact wording, punctuation, and order.
- Format each pair as 'key: value' **only if clearly structured that way in the image**.
- **Do not infer missing values or add any extra text.**
""",
"table": f"""Extract all tabular data from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
- **Preserve the table structure** (rows, columns, headers) as closely as possible.
- **Do not add missing values or infer content**—if a cell is empty, leave it empty.
- Maintain all numerical, textual, and special character formatting.
- If the table contains merged cells, indicate them clearly without altering their meaning.
- Output the table in a structured format such as Markdown, CSV, or JSON, based on the intended use.
""",
}
prompt = prompts.get(format_type, prompts["text"])
# Route to chosen provider
if self.provider == "openai":
res = self._call_openai_vision(prompt, image_base64)
else:
res = self._call_ollama_vision(prompt, image_base64)
responses.append(f"Page {idx + 1}:\n{res}")
# Clean up temporary files
if preprocess and preprocessed_path.endswith(
"_preprocessed.jpg"
):
try:
os.remove(preprocessed_path)
except OSError:
pass
if page_file.endswith((".png", ".jpg", ".jp2")):
try:
os.remove(page_file)
except OSError:
pass
final_result = "\n".join(responses)
if format_type == "json":
try:
json_data = json.loads(final_result)
return json.dumps(json_data, indent=2)
except json.JSONDecodeError:
return final_result
return final_result
else:
# Fallback: no images found; extract raw text per page
text_pages = self._pdf_extract_text(image_path)
combined = []
for i, t in enumerate(text_pages):
combined.append(f"Page {i + 1}:\n{t}")
return "\n".join(combined)
# Process non-PDF images as before.
if preprocess:
image_path = self._preprocess_image(image_path, language)
image_base64 = self._encode_image(image_path)
# Clean up temporary files
if image_path.endswith(("_preprocessed.jpg", "_temp.jpg")):
os.remove(image_path)
if custom_prompt and custom_prompt.strip():
prompt = custom_prompt
print("Using custom prompt:", prompt)
else:
prompts = {
"markdown": f"""Extract all text content from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
Format the output in markdown:
- Use headers (#, ##, ###) **only if they appear in the image**
- Preserve original lists (-, *, numbered lists) as they are
- Maintain all text formatting (bold, italics, underlines) exactly as seen
- **Do not add, interpret, or restructure any content**
""",
"text": f"""Extract all visible text from this image in {language} **without any changes**.
- **Do not summarize, paraphrase, or infer missing text.**
- Retain all spacing, punctuation, and formatting exactly as in the image.
- If text is unclear or partially visible, extract as much as possible without guessing.
- **Include all text, even if it seems irrelevant or repeated.**
""",
"json": f"""Extract all text from this image in {language} and format it as JSON, **strictly preserving** the structure.
- **Do not summarize, add, or modify any text.**
- Maintain hierarchical sections and subsections as they appear.
- Use keys that reflect the document's actual structure (e.g., "title", "body", "footer").
- Include all text, even if fragmented, blurry, or unclear.
""",
"structured": f"""Extract all text from this image in {language}, **ensuring complete structural accuracy**:
- Identify and format tables **without altering content**.
- Preserve list structures (bulleted, numbered) **exactly as shown**.
- Maintain all section headings, indents, and alignments.
- **Do not add, infer, or restructure the content in any way.**
""",
"key_value": f"""Extract all key-value pairs from this image in {language} **exactly as they appear**:
- Identify and extract labels and their corresponding values without modification.
- Maintain the exact wording, punctuation, and order.
- Format each pair as 'key: value' **only if clearly structured that way in the image**.
- **Do not infer missing values or add any extra text.**
""",
"table": f"""Extract all tabular data from this image in {language} **exactly as it appears**, without modification, summarization, or omission.
- **Preserve the table structure** (rows, columns, headers) as closely as possible.
- **Do not add missing values or infer content**—if a cell is empty, leave it empty.
- Maintain all numerical, textual, and special character formatting.
- If the table contains merged cells, indicate them clearly without altering their meaning.
- Output the table in a structured format such as Markdown, CSV, or JSON, based on the intended use.
""",
}
prompt = prompts.get(format_type, prompts["text"])
print("Using default prompt:", prompt) # Debug print
# Call chosen provider with single image
if self.provider == "openai":
result = self._call_openai_vision(prompt, image_base64)
else:
result = self._call_ollama_vision(prompt, image_base64)
if format_type == "json":
try:
json_data = json.loads(result)
return json.dumps(json_data, indent=2)
except json.JSONDecodeError:
return str(result)
return str(result)
except Exception as e:
return f"Error processing image: {str(e)}"
def process_batch(
self,
input_path: str | list[str],
format_type: str = "markdown",
recursive: bool = False,
preprocess: bool = True,
custom_prompt: str | None = None,
language: str = "en",
) -> dict[str, Any]:
"""
Process multiple images in batch
Args:
input_path: Path to directory or list of image paths
format_type: Output format type
recursive: Whether to search directories recursively
preprocess: Whether to apply image preprocessing
custom_prompt: If provided, this prompt overrides the default for each image
language: Language code to apply language specific OCR preprocessing
Returns:
Dictionary with results and statistics
"""
# Collect all image paths
image_paths: list[str | Path] = []
if isinstance(input_path, str):
base_path = Path(input_path)
if base_path.is_dir():
pattern = "**/*" if recursive else "*"
for ext in [".png", ".jpg", ".jpeg", ".pdf", ".tiff"]:
image_paths.extend(base_path.glob(f"{pattern}{ext}"))
else:
image_paths = [base_path]
else:
image_paths = [Path(p) for p in input_path]
results = {}
errors = {}
# Process images in parallel
with concurrent.futures.ThreadPoolExecutor(
max_workers=self.max_workers
) as executor:
future_to_path = {
executor.submit(
self.process_image,
str(path),
format_type,
preprocess,
custom_prompt,
language,
): path
for path in image_paths
}
for future in concurrent.futures.as_completed(future_to_path):
path = future_to_path[future]
try:
results[str(path)] = future.result()
except Exception as e:
errors[str(path)] = str(e)
# pbar.update(1)
return {
"results": results,
"errors": errors,
"statistics": {
"total": len(image_paths),
"successful": len(results),
"failed": len(errors),
},
}
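A brief usage sketch for the processor above, assuming the module is importable as `libs.ocr.processor` and an Ollama instance with a vision model is reachable at the default URL; file paths and the worker count are illustrative.

```python
# Illustrative usage of OCRProcessor (import path, model name and file paths are assumptions).
from libs.ocr.processor import OCRProcessor

processor = OCRProcessor(
    model_name="llama3.2-vision:11b",   # or e.g. "gpt-4o-mini" with provider="openai"
    provider="ollama",
    max_workers=2,
)

# Single document: PDF pages are extracted, preprocessed, then sent to the vision model.
markdown = processor.process_image("statements/2024-04.pdf", format_type="markdown")
print(markdown)

# Batch over a directory: returns per-file results plus errors and statistics.
report = processor.process_batch("statements/", format_type="key_value", recursive=True)
print(report["statistics"])
```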

View File

@@ -1,13 +1,13 @@
# Core framework dependencies (Required by all services)
fastapi>=0.118.0
fastapi>=0.119.0
uvicorn[standard]>=0.37.0
pydantic>=2.11.9
pydantic>=2.12.0
pydantic-settings>=2.11.0
# Database drivers (lightweight)
sqlalchemy>=2.0.43
sqlalchemy>=2.0.44
asyncpg>=0.30.0
psycopg2-binary>=2.9.10
psycopg2-binary>=2.9.11
neo4j>=6.0.2
redis[hiredis]>=6.4.0

View File

@@ -3,3 +3,4 @@ pdfrw>=0.4
reportlab>=4.4.4
PyPDF2>=3.0.1
pdfplumber>=0.11.7
opencv-python

View File

@@ -79,7 +79,7 @@ class StorageClient:
"""Download object from bucket"""
try:
response = self.client.get_object(bucket_name, object_name)
data = response.read()
data: bytes = response.read()
response.close()
response.release_conn()
@@ -89,7 +89,7 @@ class StorageClient:
object=object_name,
size=len(data),
)
return data # type: ignore
return data
except S3Error as e:
logger.error(

View File

@@ -18,3 +18,7 @@ disallow_untyped_defs = False
[mypy-minio.*]
ignore_missing_imports = True
[mypy-pytesseract.*]
follow_untyped_imports = True
ignore_missing_imports = True

View File

@@ -54,11 +54,20 @@ dependencies = [
"pytesseract>=0.3.10",
"Pillow>=10.1.0",
"playwright>=1.40.0",
"pyshaql>=0.25.0",
"pyshacl>=0.25.0",
"rdflib>=7.0.0",
"spacy>=3.7.0",
"presidio-analyzer>=2.2.0",
"presidio-anonymizer>=2.2.0",
"jsonschema>=4.0.0",
"boto3>=1.0.0",
"aiokafka>=0.8.0",
"hvac>=1.0.0",
"nats-py>=2.0.0",
"pydantic-settings>=2.0.0",
"opentelemetry-exporter-otlp>=1.0.0",
"opentelemetry-instrumentation-psycopg2>=0.42b0",
"opentelemetry-instrumentation-redis>=0.42b0",
]
[project.optional-dependencies]

View File

@@ -56,6 +56,10 @@ numpy>=2.3.3
# PDF processing
pdfrw>=0.4
reportlab>=4.4.4
PyPDF2>=3.0.1
pdf2image>=1.17.0
pytesseract>=0.3.10
Pillow>=10.3.0
# Date and time utilities
python-dateutil>=2.9.0
@@ -94,3 +98,4 @@ black>=25.9.0
isort>=6.0.1
bandit>=1.8.6
safety>=3.6.2
opencv-python

View File

@@ -7,9 +7,9 @@ set -e
# Configuration
REMOTE_HOST="deploy@141.136.35.199"
REMOTE_PATH="/opt/compose/ai-tax-agent"
LOCAL_COMPOSE_PATH="infra/compose/production"
ENV_FILE="infra/compose/.env.production"
REMOTE_PATH="/opt/ai-tax-agent"
LOCAL_COMPOSE_PATH="infra/base"
ENV_FILE="infra/environments/production/.env"
# Colors for output
RED='\033[0;31m'
@@ -66,13 +66,15 @@ backup_remote() {
ssh $REMOTE_HOST << 'EOF'
set -e
mkdir -p ~/backups
cd /opt/compose
cd /opt
# Backup compose directory (exclude large cert files)
tar -czf ~/backups/backup-$(date +%Y%m%d-%H%M%S).tar.gz \
--exclude='./traefik/certs/godaddy-acme.json' \
--exclude='./*/node_modules' \
.
# Backup application directory (exclude large cert files)
if [ -d ai-tax-agent ]; then
tar -czf ~/backups/backup-$(date +%Y%m%d-%H%M%S).tar.gz \
--exclude='./traefik/certs/godaddy-acme.json' \
--exclude='./*/node_modules' \
ai-tax-agent
fi
# Document current state
docker ps > ~/backups/current-services-$(date +%Y%m%d-%H%M%S).txt
@@ -100,6 +102,9 @@ prepare_remote() {
mkdir -p $REMOTE_PATH/grafana/provisioning
mkdir -p $REMOTE_PATH/grafana/dashboards
mkdir -p $REMOTE_PATH/loki
mkdir -p $REMOTE_PATH/promtail
mkdir -p $REMOTE_PATH/traefik/config
mkdir -p $REMOTE_PATH/authentik
echo "Directory structure created"
ls -la $REMOTE_PATH
@@ -110,7 +115,7 @@ EOF
# Copy files to remote server
copy_files() {
log_info "Copying compose files to remote server..."
log_info "Copying base compose files and configs to remote server..."
# Copy compose files
scp $LOCAL_COMPOSE_PATH/infrastructure.yaml $REMOTE_HOST:$REMOTE_PATH/
@@ -121,10 +126,13 @@ copy_files() {
scp $ENV_FILE $REMOTE_HOST:$REMOTE_PATH/.env
# Copy configuration files
scp -r infra/compose/prometheus/* $REMOTE_HOST:$REMOTE_PATH/prometheus/
scp -r infra/compose/grafana/provisioning/* $REMOTE_HOST:$REMOTE_PATH/grafana/provisioning/
scp -r infra/compose/grafana/dashboards/* $REMOTE_HOST:$REMOTE_PATH/grafana/dashboards/
scp -r infra/compose/loki/* $REMOTE_HOST:$REMOTE_PATH/loki/
scp -r $LOCAL_COMPOSE_PATH/prometheus/* $REMOTE_HOST:$REMOTE_PATH/prometheus/
scp -r $LOCAL_COMPOSE_PATH/grafana/provisioning/* $REMOTE_HOST:$REMOTE_PATH/grafana/provisioning/
scp -r $LOCAL_COMPOSE_PATH/grafana/dashboards/* $REMOTE_HOST:$REMOTE_PATH/grafana/dashboards/
scp -r $LOCAL_COMPOSE_PATH/loki/* $REMOTE_HOST:$REMOTE_PATH/loki/
scp -r $LOCAL_COMPOSE_PATH/promtail/* $REMOTE_HOST:$REMOTE_PATH/promtail/ 2>/dev/null || true
scp -r $LOCAL_COMPOSE_PATH/traefik/config/* $REMOTE_HOST:$REMOTE_PATH/traefik/config/ 2>/dev/null || true
scp -r $LOCAL_COMPOSE_PATH/authentik/* $REMOTE_HOST:$REMOTE_PATH/authentik/ 2>/dev/null || true
log_success "Files copied to remote server"
}

View File

@@ -1,555 +1,4 @@
# ROLE
import pytest
You are a **Senior Platform Engineer + Backend Lead** generating **production code** and **ops assets** for a microservice suite that powers an accounting Knowledge Graph + Vector RAG platform. Authentication/authorization are centralized at the **edge via Traefik + Authentik** (ForwardAuth). **Services are trust-bound** to Traefik and consume user/role claims via forwarded headers/JWT.
# MISSION
Produce fully working code for **all application services** (FastAPI + Python 3.12) with:
- Solid domain models, Pydantic v2 schemas, type hints, strict mypy, ruff lint.
- Opentelemetry tracing, Prometheus metrics, structured logging.
- Vault-backed secrets, MinIO S3 client, Qdrant client, Neo4j driver, Postgres (SQLAlchemy), Redis.
- Eventing (Kafka or SQS/SNS behind an interface).
- Deterministic data contracts, end-to-end tests, Dockerfiles, Compose, CI for Gitea.
- Traefik labels + Authentik Outpost integration for every exposed route.
- Zero PII in vectors (Qdrant), evidence-based lineage in KG, and bitemporal writes.
# GLOBAL CONSTRAINTS (APPLY TO ALL SERVICES)
- **Language & Runtime:** Python **3.12**.
- **Frameworks:** FastAPI, Pydantic v2, SQLAlchemy 2, httpx, aiokafka or boto3 (pluggable), redis-py, opentelemetry-instrumentation-fastapi, prometheus-fastapi-instrumentator.
- **Config:** `pydantic-settings` with `.env` overlay. Provide `Settings` class per service.
- **Secrets:** HashiCorp **Vault** (AppRole/JWT). Use Vault Transit to **envelope-encrypt** sensitive fields before persistence (helpers provided in `lib/security.py`).
- **Auth:** No OIDC in services. Add `TrustedProxyMiddleware`:
- Reject if request not from internal network (configurable CIDR).
- Require headers set by Traefik+Authentik (`X-Authenticated-User`, `X-Authenticated-Email`, `X-Authenticated-Groups`, `Authorization: Bearer <jwt>`).
- Parse groups into a `roles` list on `request.state` (a minimal middleware sketch follows this constraints list).
- **Observability:**
- OpenTelemetry (traceparent propagation), span attrs (service, route, user, tenant).
- Prometheus metrics endpoint `/metrics` protected by internal network check.
- Structured JSON logs (timestamp, level, svc, trace_id, msg) via `structlog`.
- **Errors:** Global exception handler RFC7807 Problem+JSON (`type`, `title`, `status`, `detail`, `instance`, `trace_id`).
- **Testing:** `pytest`, `pytest-asyncio`, `hypothesis` (property tests for calculators), `coverage 90%` per service.
- **Static:** `ruff`, `mypy --strict`, `bandit`, `safety`, `licensecheck`.
- **Perf:** Each service exposes `/healthz`, `/readyz`, `/livez`; cold start < 500ms; p95 endpoint < 250ms (local).
- **Containers:** Distroless or slim images; non-root user; read-only FS; `/tmp` mounted for OCR where needed.
- **Docs:** OpenAPI JSON + ReDoc; MkDocs site with service READMEs.
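The Auth constraint above could be realised as a small Starlette middleware. A minimal sketch, assuming a plain `BaseHTTPMiddleware`; the CIDR default and the error bodies are illustrative, not the final Problem+JSON shape.

```python
# Sketch only: trust-bound header parsing + internal-CIDR check (CIDR and bodies illustrative).
import ipaddress
from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import JSONResponse

EXEMPT_PATHS = {"/healthz", "/readyz", "/livez", "/metrics"}

class TrustedProxyMiddleware(BaseHTTPMiddleware):
    def __init__(self, app, internal_cidr: str = "10.0.0.0/8") -> None:
        super().__init__(app)
        self.internal_net = ipaddress.ip_network(internal_cidr)

    async def dispatch(self, request: Request, call_next):
        client = request.client
        if client is None or ipaddress.ip_address(client.host) not in self.internal_net:
            return JSONResponse({"title": "Forbidden", "status": 403}, status_code=403)
        if request.url.path in EXEMPT_PATHS:
            return await call_next(request)
        user = request.headers.get("X-Authenticated-User")
        email = request.headers.get("X-Authenticated-Email")
        groups = request.headers.get("X-Authenticated-Groups", "")
        token = request.headers.get("Authorization", "")
        if not (user and email and token.startswith("Bearer ")):
            return JSONResponse({"title": "Missing trust headers", "status": 401}, status_code=401)
        # Expose the parsed identity to route handlers.
        request.state.user = user
        request.state.email = email
        request.state.roles = [g.strip() for g in groups.split(",") if g.strip()]
        return await call_next(request)
```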
# SHARED LIBS (GENERATE ONCE, REUSE)
Create `libs/` used by all services:
- `libs/config.py` base `Settings`, env parsing, Vault client factory, MinIO client factory, Qdrant client factory, Neo4j driver factory, Redis factory, Kafka/SQS client factory.
- `libs/security.py` Vault Transit helpers (`encrypt_field`, `decrypt_field`), header parsing, internal-CIDR validator.
- `libs/observability.py` otel init, prometheus instrumentor, logging config.
- `libs/events.py` abstract `EventBus` with `publish(topic, payload: dict)`, `subscribe(topic, handler)`. Two impls: Kafka (`aiokafka`) and SQS/SNS (`boto3`).
- `libs/schemas.py` **canonical Pydantic models** shared across services (Document, Evidence, IncomeItem, etc.) mirroring the ontology schemas. Include JSONSchema exports.
- `libs/storage.py` S3/MinIO helpers (bucket ensure, put/get, presigned).
- `libs/neo.py` Neo4j session helpers, Cypher runner with retry, SHACL validator invoker (pySHACL on exported RDF).
- `libs/rag.py` Qdrant collections CRUD, hybrid search (dense+sparse), rerank wrapper, de-identification utilities (regex + NER; hash placeholders).
- `libs/forms.py` PDF AcroForm fill via `pdfrw` with overlay fallback via `reportlab`.
- `libs/calibration.py` `calibrated_confidence(raw_score, method="temperature_scaling", params=...)`.
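One possible shape for the `calibrated_confidence` helper is temperature scaling over the logit of the raw score; this is a sketch, and the `temperature` parameter name and logit/sigmoid form are assumptions rather than the final API.

```python
# Sketch of calibrated_confidence with temperature scaling (parameter names assumed).
import math

def calibrated_confidence(
    raw_score: float,
    method: str = "temperature_scaling",
    params: dict | None = None,
) -> float:
    if method != "temperature_scaling":
        raise ValueError(f"Unsupported calibration method: {method}")
    temperature = float((params or {}).get("temperature", 1.0))
    # Clamp away from 0/1, convert to a logit, rescale by the temperature, squash back.
    p = min(max(raw_score, 1e-6), 1.0 - 1e-6)
    logit = math.log(p / (1.0 - p))
    return 1.0 / (1.0 + math.exp(-logit / temperature))
```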
# EVENT TOPICS (STANDARDIZE)
- `doc.ingested`, `doc.ocr_ready`, `doc.extracted`, `kg.upserted`, `rag.indexed`, `calc.schedule_ready`, `form.filled`, `hmrc.submitted`, `review.requested`, `review.completed`, `firm.sync.completed`
Each payload MUST include: `event_id (ulid)`, `occurred_at (iso)`, `actor`, `tenant_id`, `trace_id`, `schema_version`, and a `data` object (service-specific).
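The required envelope fields could be captured once as a Pydantic v2 model that every topic reuses; a sketch, with `data` left as a free-form dict rather than the service-specific schemas.

```python
# Sketch of the shared event envelope (field names from the list above; types assumed).
from datetime import datetime, timezone
from pydantic import BaseModel, Field

class EventEnvelope(BaseModel):
    event_id: str                      # ULID string
    occurred_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    actor: str                         # e.g. "svc-ingestion"
    tenant_id: str
    trace_id: str
    schema_version: str = "1.0"
    data: dict                         # service-specific payload

envelope = EventEnvelope(
    event_id="01JABCDEFGHJKMNPQRSTVWXYZ0",   # illustrative ULID
    actor="svc-ingestion",
    tenant_id="t_123",
    trace_id="abc-123",
    data={"doc_id": "d_abc"},
)
print(envelope.model_dump_json())
```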
# TRUST HEADERS FROM TRAEFIK + AUTHENTIK (USE EXACT KEYS)
- `X-Authenticated-User` (string)
- `X-Authenticated-Email` (string)
- `X-Authenticated-Groups` (comma-separated)
- `Authorization` (`Bearer <jwt>` from Authentik)
Reject any request missing these (except `/healthz|/readyz|/livez|/metrics` from internal CIDR).
---
## SERVICES TO IMPLEMENT (CODE FOR EACH)
### 1) `svc-ingestion`
**Purpose:** Accept uploads or URLs, checksum, store to MinIO, emit `doc.ingested`.
**Endpoints:**
- `POST /v1/ingest/upload` (multipart file, metadata: `tenant_id`, `kind`, `source`) `{doc_id, s3_url, checksum}`
- `POST /v1/ingest/url` (json: `{url, kind, tenant_id}`) downloads to MinIO
- `GET /v1/docs/{doc_id}` metadata
**Logic:**
- Compute SHA256, dedupe by checksum; MinIO path `tenants/{tenant_id}/raw/{doc_id}.pdf`.
- Store metadata in Postgres table `ingest_documents` (alembic migrations).
- Publish `doc.ingested` with `{doc_id, bucket, key, pages?, mime}`.
**Env:** `S3_BUCKET_RAW`, `MINIO_*`, `DB_URL`.
**Traefik labels:** route `/ingest/*`.
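A sketch of the checksum/dedupe step and the tenant-scoped key layout described under Logic; `repo` and `s3` are hypothetical helpers, and uuid4 stands in for the ULID generator.

```python
# Sketch: SHA-256 checksum, dedupe-by-checksum, and the raw object key (repo/s3 are placeholders).
import hashlib
import uuid   # the spec calls for ULIDs; uuid4 stands in for this sketch

def raw_object_key(tenant_id: str, doc_id: str) -> str:
    return f"tenants/{tenant_id}/raw/{doc_id}.pdf"

async def ingest_bytes(payload: bytes, tenant_id: str, repo, s3) -> dict:
    checksum = "sha256:" + hashlib.sha256(payload).hexdigest()
    existing = await repo.find_by_checksum(tenant_id, checksum)    # hypothetical repository call
    if existing is not None:
        return {"doc_id": existing.doc_id, "deduped": True}        # dedupe by checksum
    doc_id = f"d_{uuid.uuid4().hex[:12]}"
    key = raw_object_key(tenant_id, doc_id)
    await s3.put_object("raw", key, payload)                       # hypothetical MinIO helper
    await repo.insert(doc_id=doc_id, tenant_id=tenant_id,          # row in ingest_documents
                      checksum=checksum, bucket="raw", key=key)
    return {"doc_id": doc_id, "deduped": False}
```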
---
### 2) `svc-rpa`
**Purpose:** Scheduled RPA pulls from firm/client portals via Playwright.
**Tasks:**
- Playwright login flows (credentials from Vault), 2FA via Authentik OAuth device or OTP secret in Vault.
- Download statements/invoices; hand off to `svc-ingestion` via internal POST.
- Prefect flows: `pull_portal_X()`, `pull_portal_Y()` with schedules.
**Endpoints:**
- `POST /v1/rpa/run/{connector}` (manual trigger)
- `GET /v1/rpa/status/{run_id}`
**Env:** `VAULT_ADDR`, `VAULT_ROLE_ID`, `VAULT_SECRET_ID`.
---
### 3) `svc-ocr`
**Purpose:** OCR & layout extraction.
**Pipeline:**
- Pull object from MinIO, detect rotation/de-skew (`opencv-python`), split pages (`pymupdf`), OCR (`pytesseract`) or bypass if text layer present (`pdfplumber`).
- Output per-page text + **bbox** for lines/words.
- Write JSON to MinIO `tenants/{tenant_id}/ocr/{doc_id}.json` and emit `doc.ocr_ready`.
**Endpoints:**
- `POST /v1/ocr/{doc_id}` (idempotent trigger)
- `GET /v1/ocr/{doc_id}` (fetch OCR JSON)
**Env:** `TESSERACT_LANGS`, `S3_BUCKET_EVIDENCE`.
---
### 4) `svc-extract`
**Purpose:** Classify docs and extract KV + tables into **schema-constrained JSON** (with bbox/page).
**Endpoints:**
- `POST /v1/extract/{doc_id}` body: `{strategy: "llm|rules|hybrid"}`
- `GET /v1/extract/{doc_id}` structured JSON
**Implementation:**
- Use prompt files in `prompts/`: `doc_classify.txt`, `kv_extract.txt`, `table_extract.txt`.
- **Validator loop**: run the LLM → validate against JSONSchema → retry with the error messages up to N times (see the sketch at the end of this service section).
- Return Pydantic models from `libs/schemas.py`.
- Emit `doc.extracted`.
**Env:** `LLM_ENGINE`, `TEMPERATURE`, `MAX_TOKENS`.
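The validator loop could look like the sketch below: call the LLM, validate the JSON against the schema, and feed the validation errors back into the prompt for a bounded number of retries. `call_llm` is a placeholder.

```python
# Sketch of the LLM -> JSONSchema -> retry loop (call_llm is a placeholder callable).
import json
from jsonschema import Draft202012Validator

def extract_with_validation(prompt: str, schema: dict, call_llm, max_attempts: int = 3) -> dict:
    validator = Draft202012Validator(schema)
    feedback = ""
    for attempt in range(1, max_attempts + 1):
        raw = call_llm(prompt + feedback)
        try:
            candidate = json.loads(raw)
        except json.JSONDecodeError as exc:
            feedback = f"\n\nPrevious output was not valid JSON: {exc}. Return JSON only."
            continue
        errors = list(validator.iter_errors(candidate))
        if not errors:
            return candidate
        # Feed the schema violations back to the model for the next attempt.
        feedback = "\n\nFix these schema violations:\n" + "\n".join(e.message for e in errors)
    raise ValueError(f"Extraction failed schema validation after {max_attempts} attempts")
```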
---
### 5) `svc-normalize-map`
**Purpose:** Normalize & map extracted data to KG.
**Logic:**
- Currency normalization (ECB or static fx table), dates, UK tax year/basis period inference.
- Entity resolution (blocking + fuzzy).
- Generate nodes/edges (+ `Evidence` with doc_id/page/bbox/text_hash).
- Use `libs/neo.py` to write with **bitemporal** fields; run **SHACL** validator; on violation, queue `review.requested`.
- Emit `kg.upserted`.
**Endpoints:**
- `POST /v1/map/{doc_id}`
- `GET /v1/map/{doc_id}/preview` (diff view, to be used by UI)
**Env:** `NEO4J_*`.
---
### 6) `svc-kg`
**Purpose:** Graph façade + RDF/SHACL utility.
**Endpoints:**
- `GET /v1/kg/nodes/{label}/{id}`
- `POST /v1/kg/cypher` (admin-gated inline query; must check `admin` role)
- `POST /v1/kg/export/rdf` (returns RDF for SHACL)
- `POST /v1/kg/validate` (run pySHACL against `schemas/shapes.ttl`)
- `GET /v1/kg/lineage/{node_id}` (traverse `DERIVED_FROM` Evidence)
**Env:** `NEO4J_*`.
---
### 7) `svc-rag-indexer`
**Purpose:** Build Qdrant indices (firm knowledge, legislation, best practices, glossary).
**Workflow:**
- Load sources (filesystem, URLs, Firm DMS via `svc-firm-connectors`).
- **De-identify PII** (regex + NER), replace with placeholders; store mapping only in Postgres.
- Chunk (layout-aware) per `retrieval/chunking.yaml`.
- Compute **dense** embeddings (e.g., `bge-small-en-v1.5`) and **sparse** (Qdrant sparse).
- Upsert to Qdrant with payload `{jurisdiction, tax_years[], topic_tags[], version, pii_free: true, doc_id/section_id/url}`.
- Emit `rag.indexed`.
**Endpoints:**
- `POST /v1/index/run`
- `GET /v1/index/status/{run_id}`
**Env:** `QDRANT_URL`, `RAG_EMBEDDING_MODEL`, `RAG_RERANKER_MODEL`.
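A minimal sketch of the regex half of de-identification: matches are replaced with stable hashed placeholders before chunking/embedding, and only the placeholder mapping (never the raw value) would be persisted. The patterns are illustrative, not exhaustive, and NER is omitted.

```python
# Sketch: replace PII with hashed placeholders before chunks are embedded (patterns illustrative).
import hashlib
import re

PII_PATTERNS = {
    "EMAIL": re.compile(r"\b[\w.+-]+@[\w-]+\.[\w.-]+\b"),
    "NINO": re.compile(r"\b[A-Z]{2}\d{6}[A-D]\b"),   # rough UK National Insurance number shape
    "UTR": re.compile(r"\b\d{10}\b"),                # 10-digit Unique Taxpayer Reference
}

def deidentify(text: str) -> tuple[str, dict[str, str]]:
    mapping: dict[str, str] = {}

    def _replace(kind: str):
        def inner(match: re.Match) -> str:
            digest = hashlib.sha256(match.group(0).encode()).hexdigest()[:12]
            placeholder = f"[{kind}_{digest}]"
            mapping[placeholder] = digest            # only the hash is kept, never the raw value
            return placeholder
        return inner

    for kind, pattern in PII_PATTERNS.items():
        text = pattern.sub(_replace(kind), text)
    return text, mapping

clean, links = deidentify("Contact jane@example.com, NINO AB123456C")
print(clean)   # placeholders only; the Qdrant payload can then carry pii_free=true
```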
---
### 8) `svc-rag-retriever`
**Purpose:** Hybrid search + KG fusion with rerank and calibrated confidence.
**Endpoint:**
- `POST /v1/rag/search` `{query, tax_year?, jurisdiction?, k?}`
```
{
"chunks": [...],
"citations": [{doc_id|url, section_id?, page?, bbox?}],
"kg_hints": [{rule_id, formula_id, node_ids[]}],
"calibrated_confidence": 0.0-1.0
}
```
**Implementation:**
- Hybrid score: `alpha * dense + beta * sparse`; rerank top-K via cross-encoder; **KG fusion** (boost chunks citing Rules/Calculations relevant to schedule).
- Use `libs/calibration.py` to expose calibrated confidence.
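A sketch of the scoring step only: blend dense and sparse scores, boost chunks whose payload cites KG rules relevant to the schedule, and hand the top-K to the reranker. Weights and the boost factor are illustrative.

```python
# Sketch of hybrid scoring + KG-aware boost (weights illustrative; cross-encoder call omitted).
from dataclasses import dataclass

@dataclass
class Scored:
    chunk_id: str
    dense: float
    sparse: float
    rule_ids: tuple[str, ...] = ()

def hybrid_rank(candidates: list[Scored], relevant_rules: set[str],
                alpha: float = 0.7, beta: float = 0.3, kg_boost: float = 0.1,
                k: int = 20) -> list[tuple[str, float]]:
    ranked = []
    for c in candidates:
        score = alpha * c.dense + beta * c.sparse
        if relevant_rules.intersection(c.rule_ids):
            score += kg_boost            # KG fusion: prefer chunks citing relevant Rules/Calculations
        ranked.append((c.chunk_id, score))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:k]                    # this top-K then goes to the cross-encoder reranker
```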
---
### 9) `svc-reason`
**Purpose:** Deterministic calculators + materializers (UK SA).
**Endpoints:**
- `POST /v1/reason/compute_schedule` `{tax_year, taxpayer_id, schedule_id}`
- `GET /v1/reason/explain/{schedule_id}` rationale & lineage paths
**Implementation:**
- Pure functions for: employment, self-employment, property (FHL, 20% interest credit), dividends/interest, allowances, NIC (Class 2/4), HICBC, student loans (Plans 1/2/4/5, PGL).
- **Deterministic order** as defined; rounding per `FormBox.rounding_rule`.
- Use Cypher from `kg/reasoning/schedule_queries.cypher` to materialize box values; attach `DERIVED_FROM` evidence.
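One of the pure calculator functions could look like the sketch below: the personal allowance taper removes £1 of allowance for every £2 of adjusted net income above the threshold. The defaults shown are the familiar 2024-25 figures, but in practice the values and the rounding rule come from the KG `Rule`/`FormBox` nodes.

```python
# Sketch of a deterministic, pure calculator (defaults illustrative; real values come from the KG).
from decimal import Decimal, ROUND_HALF_UP

def tapered_personal_allowance(
    adjusted_net_income: Decimal,
    full_allowance: Decimal = Decimal("12570"),
    taper_threshold: Decimal = Decimal("100000"),
) -> Decimal:
    if adjusted_net_income <= taper_threshold:
        return full_allowance
    # Allowance reduced by £1 for every £2 over the threshold, never below zero.
    reduction = (adjusted_net_income - taper_threshold) / 2
    remaining = max(full_allowance - reduction, Decimal("0"))
    # Rounding per the relevant FormBox.rounding_rule; whole pounds assumed here.
    return remaining.quantize(Decimal("1"), rounding=ROUND_HALF_UP)
```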
---
### 10) `svc-forms`
**Purpose:** Fill PDFs and assemble evidence bundles.
**Endpoints:**
- `POST /v1/forms/fill` `{tax_year, taxpayer_id, form_id}` returns PDF (binary)
- `POST /v1/forms/evidence_pack` `{scope}` ZIP + manifest + signed hashes (sha256)
**Implementation:**
- `pdfrw` for AcroForm; overlay with ReportLab if needed.
- Manifest includes `doc_id/page/bbox/text_hash` for every numeric field.
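A minimal AcroForm fill with `pdfrw` along the lines described above; the ReportLab overlay fallback is omitted, and the template path and field names are illustrative.

```python
# Sketch: fill AcroForm text fields with pdfrw (ReportLab overlay fallback not shown).
import pdfrw

def fill_acroform(template_path: str, output_path: str, values: dict[str, str]) -> None:
    pdf = pdfrw.PdfReader(template_path)
    for page in pdf.pages:
        for annot in page.Annots or []:
            if annot.Subtype == "/Widget" and annot.T:
                field = annot.T[1:-1]                 # strip the surrounding parentheses
                if field in values:
                    annot.update(pdfrw.PdfDict(V=str(values[field])))
    if pdf.Root.AcroForm:
        # Ask viewers to regenerate field appearances so the new values render.
        pdf.Root.AcroForm.update(pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject("true")))
    pdfrw.PdfWriter().write(output_path, pdf)

# Illustrative call for an SA form template:
# fill_acroform("templates/sa100.pdf", "out/sa100_filled.pdf", {"box_1": "12570"})
```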
---
### 11) `svc-hmrc`
**Purpose:** HMRC submitter (stub|sandbox|live).
**Endpoints:**
- `POST /v1/hmrc/submit` `{tax_year, taxpayer_id, dry_run}` `{status, submission_id?, errors[]}`
- `GET /v1/hmrc/submissions/{id}`
**Implementation:**
- Rate limits, retries/backoff, signed audit log; environment toggle.
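The retry/backoff behaviour around the submission call could be as simple as the sketch below, using plain `httpx` with exponential backoff and jitter; the retryable status set, cap, and URL are placeholders.

```python
# Sketch: bounded retries with exponential backoff + jitter around the HMRC submission call.
import asyncio
import random
import httpx

RETRYABLE = {429, 500, 502, 503, 504}

async def submit_with_retries(payload: dict, url: str, max_attempts: int = 5) -> httpx.Response:
    async with httpx.AsyncClient(timeout=30.0) as client:
        for attempt in range(1, max_attempts + 1):
            response = await client.post(url, json=payload)
            if response.status_code not in RETRYABLE or attempt == max_attempts:
                return response                      # success, non-retryable error, or out of attempts
            # Exponential backoff with jitter: 1s, 2s, 4s, ... capped at 30s.
            delay = min(2 ** (attempt - 1), 30) + random.uniform(0, 0.5)
            await asyncio.sleep(delay)
    raise RuntimeError("unreachable: the loop always returns")
```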
---
### 12) `svc-firm-connectors`
**Purpose:** Read-only connectors to Firm Databases (Practice Mgmt, DMS).
**Endpoints:**
- `POST /v1/firm/sync` `{since?}` `{objects_synced, errors[]}`
- `GET /v1/firm/objects` (paged)
**Implementation:**
- Data contracts in `config/firm_contracts/`; mappers write to the Secure Client Data Store (Postgres) with lineage columns (`source`, `source_id`, `synced_at`).
---
### 13) `ui-review` (outline only)
- Next.js (SSO handled by Traefik+Authentik), shows extracted fields + evidence snippets; POST overrides to `svc-extract`/`svc-normalize-map`.
---
## DATA CONTRACTS (ESSENTIAL EXAMPLES)
**Event: `doc.ingested`**
```json
{
"event_id": "01J...ULID",
"occurred_at": "2025-09-13T08:00:00Z",
"actor": "svc-ingestion",
"tenant_id": "t_123",
"trace_id": "abc-123",
"schema_version": "1.0",
"data": {
"doc_id": "d_abc",
"bucket": "raw",
"key": "tenants/t_123/raw/d_abc.pdf",
"checksum": "sha256:...",
"kind": "bank_statement",
"mime": "application/pdf",
"pages": 12
}
}
```
**RAG search response shape**
```json
{
"chunks": [
{
"id": "c1",
"text": "...",
"score": 0.78,
"payload": {
"jurisdiction": "UK",
"tax_years": ["2024-25"],
"topic_tags": ["FHL"],
"pii_free": true
}
}
],
"citations": [
{ "doc_id": "leg-ITA2007", "section_id": "s272A", "url": "https://..." }
],
"kg_hints": [
{
"rule_id": "UK.FHL.Qual",
"formula_id": "FHL_Test_v1",
"node_ids": ["n123", "n456"]
}
],
"calibrated_confidence": 0.81
}
```
---
## PERSISTENCE SCHEMAS (POSTGRES; ALEMBIC)
- `ingest_documents(id pk, tenant_id, doc_id, kind, checksum, bucket, key, mime, pages, created_at)`
- `firm_objects(id pk, tenant_id, source, source_id, type, payload jsonb, synced_at)`
- Qdrant PII mapping table (if absolutely needed): `pii_links(id pk, placeholder_hash, client_id, created_at)` **encrypt with Vault Transit**; do NOT store raw values.
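The `ingest_documents` table could map to a SQLAlchemy 2.0 declarative model along these lines; the column types and lengths are assumptions consistent with the column list above.

```python
# Sketch of the ingest_documents table in SQLAlchemy 2.0 style (types and lengths assumed).
from datetime import datetime
from sqlalchemy import BigInteger, DateTime, Integer, String, func
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

class Base(DeclarativeBase):
    pass

class IngestDocument(Base):
    __tablename__ = "ingest_documents"

    id: Mapped[int] = mapped_column(BigInteger, primary_key=True)
    tenant_id: Mapped[str] = mapped_column(String(64), index=True)
    doc_id: Mapped[str] = mapped_column(String(64), unique=True)
    kind: Mapped[str] = mapped_column(String(64))
    checksum: Mapped[str] = mapped_column(String(80), index=True)   # "sha256:<hex>"
    bucket: Mapped[str] = mapped_column(String(64))
    key: Mapped[str] = mapped_column(String(512))
    mime: Mapped[str] = mapped_column(String(128))
    pages: Mapped[int | None] = mapped_column(Integer, nullable=True)
    created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), server_default=func.now())
```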
---
## TRAEFIK + AUTHENTIK (COMPOSE LABELS PER SERVICE)
For every service container in `infra/compose/docker-compose.local.yml`, add labels:
```
- "traefik.enable=true"
- "traefik.http.routers.svc-extract.rule=Host(`api.local`) && PathPrefix(`/extract`)"
- "traefik.http.routers.svc-extract.entrypoints=websecure"
- "traefik.http.routers.svc-extract.tls=true"
- "traefik.http.routers.svc-extract.middlewares=authentik-forwardauth,rate-limit"
- "traefik.http.services.svc-extract.loadbalancer.server.port=8000"
```
Use the shared dynamic file `traefik-dynamic.yml` with `authentik-forwardauth` and `rate-limit` middlewares.
---
## OUTPUT FORMAT (STRICT)
Implement a **multi-file codebase** as fenced blocks, EXACTLY in this order:
```txt
# FILE: libs/config.py
# factories for Vault/MinIO/Qdrant/Neo4j/Redis/EventBus, Settings base
...
```
```txt
# FILE: libs/security.py
# Vault Transit helpers, header parsing, internal CIDR checks, middleware
...
```
```txt
# FILE: libs/observability.py
# otel init, prometheus, structlog
...
```
```txt
# FILE: libs/events.py
# EventBus abstraction with Kafka and SQS/SNS impls
...
```
```txt
# FILE: libs/schemas.py
# Shared Pydantic models mirroring ontology entities
...
```
```txt
# FILE: apps/svc-ingestion/main.py
# FastAPI app, endpoints, MinIO write, Postgres, publish doc.ingested
...
```
```txt
# FILE: apps/svc-rpa/main.py
# Playwright flows, Prefect tasks, triggers
...
```
```txt
# FILE: apps/svc-ocr/main.py
# OCR pipeline, endpoints
...
```
```txt
# FILE: apps/svc-extract/main.py
# Classifier + extractors with validator loop
...
```
```txt
# FILE: apps/svc-normalize-map/main.py
# normalization, entity resolution, KG mapping, SHACL validation call
...
```
```txt
# FILE: apps/svc-kg/main.py
# KG façade, RDF export, SHACL validate, lineage traversal
...
```
```txt
# FILE: apps/svc-rag-indexer/main.py
# chunk/de-id/embed/upsert to Qdrant
...
```
```txt
# FILE: apps/svc-rag-retriever/main.py
# hybrid retrieval + rerank + KG fusion
...
```
```txt
# FILE: apps/svc-reason/main.py
# deterministic calculators, schedule compute/explain
...
```
```txt
# FILE: apps/svc-forms/main.py
# PDF fill + evidence pack
...
```
```txt
# FILE: apps/svc-hmrc/main.py
# submit stub|sandbox|live with audit + retries
...
```
```txt
# FILE: apps/svc-firm-connectors/main.py
# connectors to practice mgmt & DMS, sync to Postgres
...
```
```txt
# FILE: infra/compose/docker-compose.local.yml
# Traefik, Authentik, Vault, MinIO, Qdrant, Neo4j, Postgres, Redis, Prom+Grafana, Loki, Unleash, all services
...
```
```txt
# FILE: infra/compose/traefik.yml
# static Traefik config
...
```
```txt
# FILE: infra/compose/traefik-dynamic.yml
# forwardAuth middleware + routers/services
...
```
```txt
# FILE: .gitea/workflows/ci.yml
# lint->test->build->scan->push->deploy
...
```
```txt
# FILE: Makefile
# bootstrap, run, test, lint, build, deploy, format, seed
...
```
```txt
# FILE: tests/e2e/test_happy_path.py
# end-to-end: ingest -> ocr -> extract -> map -> compute -> fill -> (stub) submit
...
```
```txt
# FILE: tests/unit/test_calculators.py
# boundary tests for UK SA logic (NIC, HICBC, PA taper, FHL)
...
```
```txt
# FILE: README.md
# how to run locally with docker-compose, Authentik setup, Traefik certs
...
```
## DEFINITION OF DONE
- `docker compose up` brings the full stack up; SSO via Authentik; routes secured via Traefik ForwardAuth.
- Running `pytest` yields 90% coverage; `make e2e` passes the ingest → submit stub flow.
- All services expose `/healthz|/readyz|/livez|/metrics`; OpenAPI at `/docs`.
- No PII stored in Qdrant; vectors carry `pii_free=true`.
- KG writes are SHACL-validated; violations produce `review.requested` events.
- Evidence lineage is present for every numeric box value.
- Gitea pipeline passes: lint, test, build, scan, push, deploy.
# START
Generate the full codebase and configs in the **exact file blocks and order** specified above.
def test_happy_path():
pass