"""Data normalization and knowledge graph mapping."""
# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_minio_client,
create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()

class NormalizeMapSettings(BaseAppSettings):
    """Settings for normalize-map service"""

    service_name: str = "svc-normalize-map"

    # Normalization configuration
    currency_default: str = "GBP"
    date_formats: list[str] = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%d %B %Y",
        "%d %b %Y",
        "%B %d, %Y",
    ]

    # Mapping configuration
    confidence_threshold: float = 0.7
    auto_create_entities: bool = True

    # Validation rules
    max_amount: float = 1000000.0  # £1M
    min_confidence: float = 0.5
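
# NOTE: assuming BaseAppSettings follows the usual pydantic BaseSettings
# convention, each field above can be overridden via an environment variable
# at deploy time, e.g. (illustrative):
#
#   CONFIDENCE_THRESHOLD=0.85 CURRENCY_DEFAULT=GBP uvicorn main:app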

# Create app and settings
app, settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize-Map Service",
    description="Data normalization and knowledge graph mapping service",
    settings_class=NormalizeMapSettings,
)

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None

tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, event_bus
logger.info("Starting normalize-map service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
# Subscribe to extraction completion events
await event_bus.subscribe( # type: ignore
EventTopics.DOC_EXTRACTED, _handle_extraction_completed
)
logger.info("Normalize-map service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, neo4j_client
logger.info("Shutting down normalize-map service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Normalize-map service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
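
# Illustrative check (the port assumes the uvicorn defaults at the bottom of
# this file; the response body is a sketch):
#
#   curl http://localhost:8004/health
#   {"status": "healthy", "service": "svc-normalize-map", ...}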
@app.post("/normalize/{doc_id}")
async def normalize_document(
doc_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Normalize and map document data to knowledge graph"""
with tracer.start_as_current_span("normalize_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if extraction results exist
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# Generate normalization ID
normalization_id = str(ulid.new())
span.set_attribute("normalization_id", normalization_id)
# Start background normalization
background_tasks.add_task(
_normalize_and_map_async,
doc_id,
tenant_id,
extraction_results,
normalization_id,
current_user.get("sub", "system"),
)
logger.info(
"Normalization started",
doc_id=doc_id,
normalization_id=normalization_id,
)
return {
"normalization_id": normalization_id,
"doc_id": doc_id,
"status": "processing",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start normalization")
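
# Illustrative call (the doc ID and bearer token here are hypothetical;
# authentication is resolved by get_current_user / get_tenant_id):
#
#   curl -X POST http://localhost:8004/normalize/01HZX... \
#        -H "Authorization: Bearer <token>"
#   {"normalization_id": "01J...", "doc_id": "01HZX...", "status": "processing"}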

async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
    """Handle extraction completion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")
        confidence = data.get("confidence", 0.0)

        if not doc_id or not tenant_id:
            logger.warning("Invalid extraction completion event", data=data)
            return

        # Only auto-process if confidence is above threshold
        if confidence >= settings.confidence_threshold:
            logger.info(
                "Auto-normalizing extracted document",
                doc_id=doc_id,
                confidence=confidence,
            )
            extraction_results = data.get("extraction_results")
            if not extraction_results:
                extraction_results = await document_storage.get_extraction_result(
                    tenant_id, doc_id
                )
            if extraction_results:
                await _normalize_and_map_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    extraction_results=extraction_results,
                    normalization_id=str(ulid.new()),
                    actor=payload.actor,
                )
        else:
            logger.info(
                "Skipping auto-normalization due to low confidence",
                doc_id=doc_id,
                confidence=confidence,
            )
    except Exception as e:
        logger.error("Failed to handle extraction completion", error=str(e))

async def _normalize_and_map_async(
    doc_id: str,
    tenant_id: str,
    extraction_results: dict[str, Any],
    normalization_id: str,
    actor: str,
) -> None:
    """Normalize and map data asynchronously"""
    with tracer.start_as_current_span("normalize_and_map_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("normalization_id", normalization_id)

        try:
            extracted_fields = extraction_results.get("extracted_fields", {})
            provenance = extraction_results.get("provenance", [])

            # Normalize extracted data
            normalized_data = await _normalize_data(extracted_fields, provenance)

            # Map to knowledge graph entities
            entities = await _map_to_entities(normalized_data, doc_id, tenant_id)

            # Store entities in knowledge graph
            stored_entities = await _store_entities(entities, tenant_id)

            # Create normalization results
            normalization_results = {
                "doc_id": doc_id,
                "normalization_id": normalization_id,
                "normalized_at": datetime.utcnow().isoformat(),
                "normalized_data": normalized_data,
                "entities": stored_entities,
                "entity_count": len(stored_entities),
            }

            # Update metrics
            metrics.counter("documents_normalized_total").labels(
                tenant_id=tenant_id
            ).inc()
            metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
                len(stored_entities)
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "normalization_id": normalization_id,
                    "entity_count": len(stored_entities),
                    "entities": stored_entities,
                },
                actor=actor,
                tenant_id=tenant_id,
            )
            await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)

            # Log once, after publishing (was previously duplicated)
            logger.info("Normalization completed", results=normalization_results)
        except Exception as e:
            logger.error("Normalization failed", doc_id=doc_id, error=str(e))
            # Update error metrics
            metrics.counter("normalization_errors_total").labels(
                tenant_id=tenant_id, error_type=type(e).__name__
            ).inc()

async def _normalize_data(
    extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
    """Normalize extracted data"""
    normalized = {}
    for field_name, raw_value in extracted_fields.items():
        try:
            if "amount" in field_name.lower() or "total" in field_name.lower():
                normalized[field_name] = _normalize_amount(raw_value)
            elif "date" in field_name.lower():
                normalized[field_name] = _normalize_date(raw_value)
            elif "name" in field_name.lower():
                normalized[field_name] = _normalize_name(raw_value)
            elif "address" in field_name.lower():
                normalized[field_name] = _normalize_address(raw_value)
            elif "number" in field_name.lower():
                normalized[field_name] = _normalize_number(raw_value)
            else:
                normalized[field_name] = _normalize_text(raw_value)
        except Exception as e:
            logger.warning(
                "Failed to normalize field",
                field=field_name,
                value=raw_value,
                error=str(e),
            )
            normalized[field_name] = raw_value  # Keep original value
    return normalized

def _normalize_amount(value: str) -> dict[str, Any]:
    """Normalize monetary amount"""
    import re

    if not value:
        return {"amount": None, "currency": settings.currency_default}

    # Remove currency symbols and formatting
    clean_value = re.sub(r"[£$€,\s]", "", str(value))
    try:
        amount = Decimal(clean_value)
        # Validate amount (warn but still return the parsed value)
        if amount > settings.max_amount:
            logger.warning("Amount exceeds maximum", amount=amount)
        return {
            "amount": float(amount),
            "currency": settings.currency_default,
            "original": value,
        }
    except Exception:
        return {
            "amount": None,
            "currency": settings.currency_default,
            "original": value,
        }
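
# Illustrative behaviour (with the default settings above):
#
#   _normalize_amount("£1,234.50")
#   -> {"amount": 1234.5, "currency": "GBP", "original": "£1,234.50"}
#   _normalize_amount("not a number")
#   -> {"amount": None, "currency": "GBP", "original": "not a number"}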

def _normalize_date(value: str) -> dict[str, Any]:
    """Normalize date"""
    from dateutil import parser

    if not value:
        return {"date": None, "original": value}

    try:
        # Try parsing with dateutil first
        parsed_date = parser.parse(str(value), dayfirst=True)
        return {"date": parsed_date.date().isoformat(), "original": value}
    except Exception:
        # Try manual formats
        for fmt in settings.date_formats:
            try:
                parsed_date = datetime.strptime(str(value), fmt)
                return {"date": parsed_date.date().isoformat(), "original": value}
            except Exception:
                continue
    return {"date": None, "original": value}

def _normalize_name(value: str) -> dict[str, Any]:
    """Normalize person/company name"""
    if not value:
        return {"name": None, "original": value}

    # Clean and title case
    clean_name = str(value).strip().title()

    # Detect if it's a company (contains Ltd, Limited, etc.)
    company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
    is_company = any(indicator in clean_name for indicator in company_indicators)

    return {
        "name": clean_name,
        "type": "company" if is_company else "person",
        "original": value,
    }
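
# Illustrative behaviour:
#
#   _normalize_name("ACME TRADING LTD")
#   -> {"name": "Acme Trading Ltd", "type": "company", "original": "ACME TRADING LTD"}
#   _normalize_name("jane smith")
#   -> {"name": "Jane Smith", "type": "person", "original": "jane smith"}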

def _normalize_address(value: str) -> dict[str, Any]:
    """Normalize address"""
    import re

    if not value:
        return {"address": None, "original": value}

    clean_address = str(value).strip()

    # Extract UK postcode
    postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
    postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
    postcode = postcode_match.group().upper() if postcode_match else None

    return {"address": clean_address, "postcode": postcode, "original": value}

def _normalize_number(value: str) -> dict[str, Any]:
    """Normalize reference numbers"""
    import re

    if not value:
        return {"number": None, "original": value}

    # Remove spaces and special characters
    clean_number = re.sub(r"[^\w]", "", str(value))

    # Detect number type
    number_type = "unknown"
    if len(clean_number) == 10 and clean_number.isdigit():
        number_type = "utr"  # UTR is 10 digits
    elif len(clean_number) == 8 and clean_number.isdigit():
        number_type = "account_number"
    elif re.match(r"^\d{6}$", clean_number):
        number_type = "sort_code"

    return {"number": clean_number, "type": number_type, "original": value}

def _normalize_text(value: str) -> dict[str, Any]:
    """Normalize general text"""
    if not value:
        return {"text": None, "original": value}
    clean_text = str(value).strip()
    return {"text": clean_text, "original": value}

async def _map_to_entities(
    normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
    """Map normalized data to knowledge graph entities"""
    entities = []

    # Create document entity
    doc_entity = {
        "type": "Document",
        "id": doc_id,
        "properties": {
            "doc_id": doc_id,
            "tenant_id": tenant_id,
            "processed_at": datetime.utcnow().isoformat(),
            "source": "extraction",
            "extractor_version": "1.0.0",
            "valid_from": datetime.utcnow(),
            "asserted_at": datetime.utcnow(),
        },
    }
    entities.append(doc_entity)

    # Map specific field types to entities
    for field_name, normalized_value in normalized_data.items():
        if isinstance(normalized_value, dict):
            if "amount" in normalized_value and normalized_value["amount"] is not None:
                # Create expense or income item
                entity_type = (
                    "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
                )
                entity = {
                    "type": entity_type,
                    "id": f"{entity_type.lower()}_{ulid.new()}",
                    "properties": {
                        "amount": normalized_value["amount"],
                        "currency": normalized_value["currency"],
                        "description": field_name,
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)
            elif "name" in normalized_value and normalized_value["name"] is not None:
                # Create party entity
                entity = {
                    "type": "Party",
                    "id": f"party_{ulid.new()}",
                    "properties": {
                        "name": normalized_value["name"],
                        "party_type": normalized_value.get("type", "unknown"),
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

    return entities
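
# Sketch of the output for one normalized amount field (entity IDs are
# freshly minted ULIDs, so the values here are illustrative):
#
#   [
#       {"type": "Document", "id": "<doc_id>", "properties": {...}},
#       {"type": "IncomeItem", "id": "incomeitem_01J...",
#        "properties": {"amount": 1234.5, "currency": "GBP", ...}},
#   ]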

async def _store_entities(
    entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
    """Store entities in knowledge graph"""
    stored_entities = []
    for entity in entities:
        try:
            # Create node in Neo4j
            result = await neo4j_client.create_node(
                label=entity["type"], properties=entity["properties"]
            )
            stored_entities.append(
                {
                    "type": entity["type"],
                    "id": entity["id"],
                    "neo4j_id": result.get("id"),
                    "properties": entity["properties"],
                }
            )
            logger.debug("Entity stored", type=entity["type"], id=entity["id"])
        except Exception as e:
            logger.error("Failed to store entity", entity=entity, error=str(e))
    return stored_entities

@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).dict(),
    )
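
# Illustrative RFC 7807 body produced for a 404 from /normalize/{doc_id}
# (instance URL is hypothetical):
#
#   {
#       "type": "https://httpstatuses.com/404",
#       "title": "Extraction results not found",
#       "status": 404,
#       "detail": "Extraction results not found",
#       "instance": "http://localhost:8004/normalize/01HZX...",
#       "trace_id": ""
#   }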
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8004, reload=True, log_config=None)