completed local setup with compose
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
@@ -1,53 +1,27 @@
# Multi-stage build for svc_normalize_map
FROM python:3.12-slim AS builder
FROM python:3.12-slim-bookworm

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Set environment variables
ENV PYTHONUNBUFFERED 1
ENV APP_HOME /app

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Create and set working directory
WORKDIR $APP_HOME

# Copy requirements and install dependencies
# Install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Production stage
FROM python:3.12-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r appuser \
    && useradd -r -g appuser appuser

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set working directory
WORKDIR /app
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/

# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]
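The HEALTHCHECK above probes /healthz with curl inside the container. The same probe can be reproduced from the host once the service is up; a minimal sketch, assuming the container's port 8000 is published to localhost as in the EXPOSE/CMD lines (everything else is illustrative):

import urllib.request

def is_healthy(url: str = "http://localhost:8000/healthz", timeout: float = 10.0) -> bool:
    """Return True when the service answers the health probe with HTTP 200."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.status == 200
    except OSError:  # connection refused, timeout, or non-2xx HTTPError
        return False

if __name__ == "__main__":
    raise SystemExit(0 if is_healthy() else 1)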
@@ -1,24 +1,11 @@
"""Data normalization and knowledge graph mapping."""

# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr

import os

# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
from datetime import UTC, datetime
from typing import Any, cast

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -34,83 +21,68 @@ from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class NormalizeMapSettings(BaseAppSettings):
    """Settings for normalize-map service"""
    """Settings for NormalizeMap service"""

    service_name: str = "svc-normalize-map"

    # Normalization configuration
    currency_default: str = "GBP"
    date_formats: list[str] = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%d %B %Y",
        "%d %b %Y",
        "%B %d, %Y",
    ]

    # Mapping configuration
    confidence_threshold: float = 0.7
    auto_create_entities: bool = True

    # Validation rules
    max_amount: float = 1000000.0  # £1M
    min_confidence: float = 0.5


# Create app and settings
app, settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize-Map Service",
    description="Data normalization and knowledge graph mapping service",
    settings_class=NormalizeMapSettings,
)

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
neo4j_client: Neo4jClient | None = None

settings: NormalizeMapSettings


@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: NormalizeMapSettings) -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, neo4j_client, event_bus
    global storage_client, document_storage, event_bus, neo4j_client, settings

    logger.info("Starting normalize-map service")
    settings = app_settings
    logger.info("Starting NormalizeMap service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize Neo4j client
    neo4j_driver = create_neo4j_client(settings)
    neo4j_client = Neo4jClient(neo4j_driver)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        raise HTTPException(status_code=500, detail="Event bus not initialized")
    await event_bus.start()

    # Subscribe to extraction completion events
    await event_bus.subscribe(  # type: ignore
        EventTopics.DOC_EXTRACTED, _handle_extraction_completed
    )
    await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted)

    logger.info("Normalize-map service started successfully")
    logger.info("NormalizeMap service started successfully")


app, _settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize and Map Service",
    description="Normalize extracted data and map to Knowledge Graph",
    settings_class=NormalizeMapSettings,
)


# Initialize dependencies immediately
@app.on_event("startup")
async def startup_event():  # type: ignore
    await init_dependencies(cast(NormalizeMapSettings, _settings))


tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()


@app.on_event("shutdown")
@@ -118,456 +90,235 @@ async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global event_bus, neo4j_client

    logger.info("Shutting down normalize-map service")

    if neo4j_client:
        await neo4j_client.close()

    logger.info("Shutting down NormalizeMap service")
    if event_bus:
        await event_bus.stop()

    logger.info("Normalize-map service shutdown complete")
    if neo4j_client:
        await neo4j_client.close()
    logger.info("NormalizeMap service shutdown complete")


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": datetime.utcnow().isoformat(),
    }
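# Example /health response (values are illustrative):
#   {"status": "healthy", "service": "svc-normalize-map", "version": "1.0.0",
#    "timestamp": "2024-05-01T12:00:00"}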
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
    """Handle document extracted events"""
    data = payload.data
    doc_id = data.get("doc_id")
    tenant_id = data.get("tenant_id")
    extracted_fields = data.get("extraction_results", {}).get("extracted_fields", {})
    provenance = data.get("extraction_results", {}).get("provenance", [])

    if not doc_id or not tenant_id or not extracted_fields:
        logger.warning("Invalid document extracted event", data=data)
        return
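# Illustrative shape of the doc.extracted payload consumed above (the keys are
# the ones this handler reads; the concrete values are invented):
#   {
#       "doc_id": "01HZX0000000000000000000EX",
#       "tenant_id": "tenant-123",
#       "extraction_results": {
#           "extracted_fields": {"invoice_date": "2024-05-01", "total_amount": "£1,250.00"},
#           "provenance": [{"field": "total_amount", "page": 1, "bbox": [0, 0, 100, 40], "confidence": 0.93}],
#       },
#   }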
@app.post("/normalize/{doc_id}")
async def normalize_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Normalize and map document data to knowledge graph"""

    with tracer.start_as_current_span("normalize_document") as span:
    with tracer.start_as_current_span("normalize_and_map") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Check if extraction results exist
            extraction_results = await document_storage.get_extraction_result(
                tenant_id, doc_id
            )
            if not extraction_results:
                raise HTTPException(
                    status_code=404, detail="Extraction results not found"
                )
            # 1. Normalize data
            normalized_data = await _normalize_data(extracted_fields)

            # Generate normalization ID
            normalization_id = str(ulid.new())
            span.set_attribute("normalization_id", normalization_id)

            # Start background normalization
            background_tasks.add_task(
                _normalize_and_map_async,
                doc_id,
                tenant_id,
                extraction_results,
                normalization_id,
                current_user.get("sub", "system"),
            # 2. Map to KG ontology
            kg_upsert_payload = await _map_to_kg_ontology(
                doc_id, tenant_id, normalized_data, provenance
            )

            logger.info(
                "Normalization started",
                doc_id=doc_id,
                normalization_id=normalization_id,
            # 3. Publish kg.upsert.ready event
            event_payload = EventPayload(
                data=kg_upsert_payload,
                actor=payload.actor,
                tenant_id=tenant_id,
                trace_id=str(span.get_span_context().trace_id),
            )
            await event_bus.publish(EventTopics.KG_UPSERT_READY, event_payload)  # type: ignore

            return {
                "normalization_id": normalization_id,
                "doc_id": doc_id,
                "status": "processing",
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to start normalization")


async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
    """Handle extraction completion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")
        confidence = data.get("confidence", 0.0)

        if not doc_id or not tenant_id:
            logger.warning("Invalid extraction completion event", data=data)
            return

        # Only auto-process if confidence is above threshold
        if confidence >= settings.confidence_threshold:
            logger.info(
                "Auto-normalizing extracted document",
                doc_id=doc_id,
                confidence=confidence,
            )

            extraction_results = data.get("extraction_results")
            if not extraction_results:
                extraction_results = await document_storage.get_extraction_result(
                    tenant_id, doc_id
                )

            if extraction_results:
                await _normalize_and_map_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    extraction_results=extraction_results,
                    normalization_id=str(ulid.new()),
                    actor=payload.actor,
                )
        else:
            logger.info(
                "Skipping auto-normalization due to low confidence",
                doc_id=doc_id,
                confidence=confidence,
            )

    except Exception as e:
        logger.error("Failed to handle extraction completion", error=str(e))


async def _normalize_and_map_async(
    doc_id: str,
    tenant_id: str,
    extraction_results: dict[str, Any],
    normalization_id: str,
    actor: str,
) -> None:
    """Normalize and map data asynchronously"""

    with tracer.start_as_current_span("normalize_and_map_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("normalization_id", normalization_id)

        try:
            extracted_fields = extraction_results.get("extracted_fields", {})
            provenance = extraction_results.get("provenance", [])

            # Normalize extracted data
            normalized_data = await _normalize_data(extracted_fields, provenance)

            # Map to knowledge graph entities
            entities = await _map_to_entities(normalized_data, doc_id, tenant_id)

            # Store entities in knowledge graph
            stored_entities = await _store_entities(entities, tenant_id)

            # Create normalization results
            normalization_results = {
                "doc_id": doc_id,
                "normalization_id": normalization_id,
                "normalized_at": datetime.utcnow().isoformat(),
                "normalized_data": normalized_data,
                "entities": stored_entities,
                "entity_count": len(stored_entities),
            }

            logger.info("Normalization completed", results=normalization_results)

            # Update metrics
            metrics.counter("documents_normalized_total").labels(
            metrics.counter("normalized_documents_total").labels(
                tenant_id=tenant_id
            ).inc()

            metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
                len(stored_entities)
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "normalization_id": normalization_id,
                    "entity_count": len(stored_entities),
                    "entities": stored_entities,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)

            logger.info(
                "Normalization completed", doc_id=doc_id, entities=len(stored_entities)
                "Document normalized and mapped", doc_id=doc_id, tenant_id=tenant_id
            )

        except Exception as e:
            logger.error("Normalization failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            logger.error(
                "Failed to normalize and map document", doc_id=doc_id, error=str(e)
            )
            metrics.counter("normalization_errors_total").labels(
                tenant_id=tenant_id, error_type=type(e).__name__
            ).inc()


async def _normalize_data(
    extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
    """Normalize extracted data"""

    normalized = {}

    for field_name, raw_value in extracted_fields.items():
        try:
            if "amount" in field_name.lower() or "total" in field_name.lower():
                normalized[field_name] = _normalize_amount(raw_value)
            elif "date" in field_name.lower():
                normalized[field_name] = _normalize_date(raw_value)
            elif "name" in field_name.lower():
                normalized[field_name] = _normalize_name(raw_value)
            elif "address" in field_name.lower():
                normalized[field_name] = _normalize_address(raw_value)
            elif "number" in field_name.lower():
                normalized[field_name] = _normalize_number(raw_value)
            else:
                normalized[field_name] = _normalize_text(raw_value)

        except Exception as e:
            logger.warning(
                "Failed to normalize field",
                field=field_name,
                value=raw_value,
                error=str(e),
            )
            normalized[field_name] = raw_value  # Keep original value

    return normalized


def _normalize_amount(value: str) -> dict[str, Any]:
    """Normalize monetary amount"""
    import re

    if not value:
        return {"amount": None, "currency": settings.currency_default}

    # Remove currency symbols and formatting
    clean_value = re.sub(r"[£$€,\s]", "", str(value))

    try:
        amount = Decimal(clean_value)

        # Validate amount
        if amount > settings.max_amount:
            logger.warning("Amount exceeds maximum", amount=amount)

        return {
            "amount": float(amount),
            "currency": settings.currency_default,
            "original": value,
        }
    except Exception:
        return {
            "amount": None,
            "currency": settings.currency_default,
            "original": value,
        }
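# Illustrative behaviour of _normalize_amount (values invented):
#   _normalize_amount("£1,250.00") -> {"amount": 1250.0, "currency": "GBP", "original": "£1,250.00"}
#   _normalize_amount("")          -> {"amount": None, "currency": "GBP"}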
def _normalize_date(value: str) -> dict[str, Any]:
    """Normalize date"""
    from dateutil import parser

    if not value:
        return {"date": None, "original": value}

    try:
        # Try parsing with dateutil first
        parsed_date = parser.parse(str(value), dayfirst=True)
        return {"date": parsed_date.date().isoformat(), "original": value}
    except Exception:
        # Try manual formats
        for fmt in settings.date_formats:
async def _normalize_data(extracted_fields: dict[str, Any]) -> dict[str, Any]:
    """Normalize extracted data into a consistent format"""
    normalized_data = {}
    for key, value in extracted_fields.items():
        # Example: Simple date normalization (can be expanded)
        if "date" in key.lower() and isinstance(value, str):
            try:
                parsed_date = datetime.strptime(str(value), fmt)
                return {"date": parsed_date.date().isoformat(), "original": value}
            except Exception:
                continue

        return {"date": None, "original": value}
                # Attempt to parse various date formats
                # Add more robust date parsing logic here as needed
                normalized_data[key] = datetime.fromisoformat(value).date().isoformat()
            except ValueError:
                normalized_data[key] = value  # Keep original if parsing fails
        elif "amount" in key.lower() and isinstance(value, str):
            # Example: Normalize currency to a Decimal
            try:
                normalized_data[key] = float(value.replace("£", "").replace(",", ""))
            except ValueError:
                normalized_data[key] = value
        else:
            normalized_data[key] = value
    return normalized_data


def _normalize_name(value: str) -> dict[str, Any]:
    """Normalize person/company name"""
    if not value:
        return {"name": None, "original": value}
async def _map_to_kg_ontology(
    doc_id: str,
    tenant_id: str,
    normalized_data: dict[str, Any],
    provenance: list[dict[str, Any]],
) -> dict[str, Any]:
    """Map normalized data to Knowledge Graph ontology nodes and relationships based on kg_schema.json"""
    nodes = []
    relationships = []
    now = datetime.now(UTC).isoformat()

    # Clean and title case
    clean_name = str(value).strip().title()
    # Create a Document node
    doc_node_id = f"document_{doc_id}"
    nodes.append(
        {
            "id": doc_node_id,
            "type": "Document",
            "properties": {
                "node_type": "Document",
                "doc_id": doc_id,
                "kind": normalized_data.get("kind", "OtherSupportingDoc"),
                "source": normalized_data.get("source", "manual_upload"),
                "checksum": normalized_data.get("checksum", ""),
                "valid_from": now,
                "asserted_at": now,
                # "source": "svc-normalize-map",
                "extractor_version": "1.0.0",
            },
        }
    )

    # Detect if it's a company (contains Ltd, Limited, etc.)
    company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
    is_company = any(indicator in clean_name for indicator in company_indicators)
    # Create a TaxpayerProfile node
    taxpayer_id = normalized_data.get("taxpayer_id", "unknown_taxpayer")
    taxpayer_node_id = f"taxpayer_{taxpayer_id}"
    nodes.append(
        {
            "id": taxpayer_node_id,
            "type": "TaxpayerProfile",
            "properties": {
                "node_type": "TaxpayerProfile",
                "taxpayer_id": taxpayer_id,
                "type": "Individual",
                "valid_from": now,
                "asserted_at": now,
                "source": "svc-normalize-map",
                "extractor_version": "1.0.0",
            },
        }
    )

    relationships.append(
        {
            "id": f"rel_document_to_taxpayer_{doc_id}",
            "type": "BELONGS_TO",
            "sourceId": doc_node_id,
            "targetId": taxpayer_node_id,
            "properties": {},
        }
    )

    # Create IncomeItem/ExpenseItem nodes and Evidence nodes
    item_type = (
        "IncomeItem" if normalized_data.get("kind") == "invoice" else "ExpenseItem"
    )

    for field, value in normalized_data.items():
        if field in ["total_amount", "net_amount", "vat_amount", "amount"]:
            item_id = f"item_{ulid.new()}"
            item_node_id = f"{item_type.lower()}_{item_id}"

            # Create the financial item node (IncomeItem or ExpenseItem)
            nodes.append(
                {
                    "id": item_node_id,
                    "type": item_type,
                    "properties": {
                        "node_type": item_type,
                        "type": (
                            "self_employment"
                            if "invoice" in normalized_data.get("kind", "")
                            else "other"
                        ),
                        "gross": value,
                        "currency": "GBP",
                        "description": normalized_data.get("description", field),
                        "valid_from": now,
                        "asserted_at": now,
                        "source": "svc-normalize-map",
                        "extractor_version": "1.0.0",
                    },
                }
            )

            relationships.append(
                {
                    "id": f"rel_taxpayer_has_{item_type.lower()}_{item_id}",
                    "type": (
                        "HAS_INCOME" if item_type == "IncomeItem" else "HAS_EXPENSE"
                    ),
                    "sourceId": taxpayer_node_id,
                    "targetId": item_node_id,
                    "properties": {},
                }
            )

            # Create an Evidence node linking the item to the document
            prov = next((p for p in provenance if p["field"] == field), None)
            if prov:
                evidence_id = f"evidence_{item_id}"
                nodes.append(
                    {
                        "id": evidence_id,
                        "type": "Evidence",
                        "properties": {
                            "node_type": "Evidence",
                            "snippet_id": evidence_id,
                            "doc_ref": doc_id,
                            "page": prov.get("page"),
                            "bbox": prov.get("bbox"),
                            "text_hash": "dummy_hash",  # Placeholder
                            "ocr_confidence": prov.get("confidence"),
                            "extracted_text": str(value),
                            "valid_from": now,
                            "asserted_at": now,
                            "source": "svc-normalize-map",
                            "extractor_version": "1.0.0",
                        },
                    }
                )

                relationships.append(
                    {
                        "id": f"rel_item_supported_by_evidence_{item_id}",
                        "type": "SUPPORTED_BY",
                        "sourceId": item_node_id,
                        "targetId": evidence_id,
                        "properties": {},
                    }
                )

    return {
        "name": clean_name,
        "type": "company" if is_company else "person",
        "original": value,
        "nodes": nodes,
        "relationships": relationships,
        "document_id": doc_id,
        "tenant_id": tenant_id,
    }


def _normalize_address(value: str) -> dict[str, Any]:
    """Normalize address"""
    import re

    if not value:
        return {"address": None, "original": value}

    clean_address = str(value).strip()

    # Extract UK postcode
    postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
    postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
    postcode = postcode_match.group().upper() if postcode_match else None

    return {"address": clean_address, "postcode": postcode, "original": value}
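# Illustrative behaviour (input invented):
#   _normalize_address("1 High St, London sw1a 1aa")
#   -> {"address": "1 High St, London sw1a 1aa", "postcode": "SW1A 1AA", "original": "1 High St, London sw1a 1aa"}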
def _normalize_number(value: str) -> dict[str, Any]:
    """Normalize reference numbers"""
    import re

    if not value:
        return {"number": None, "original": value}

    # Remove spaces and special characters
    clean_number = re.sub(r"[^\w]", "", str(value))

    # Detect number type
    number_type = "unknown"
    if len(clean_number) == 10 and clean_number.isdigit():
        number_type = "utr"  # UTR is 10 digits
    elif len(clean_number) == 8 and clean_number.isdigit():
        number_type = "account_number"
    elif re.match(r"^\d{6}$", clean_number):
        number_type = "sort_code"

    return {"number": clean_number, "type": number_type, "original": value}
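# Illustrative behaviour (a UTR is HMRC's 10-digit Unique Taxpayer Reference):
#   _normalize_number("12345 67890") -> {"number": "1234567890", "type": "utr", "original": "12345 67890"}
#   _normalize_number("12-34-56")    -> {"number": "123456", "type": "sort_code", "original": "12-34-56"}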
def _normalize_text(value: str) -> dict[str, Any]:
    """Normalize general text"""
    if not value:
        return {"text": None, "original": value}

    clean_text = str(value).strip()

    return {"text": clean_text, "original": value}


async def _map_to_entities(
    normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
    """Map normalized data to knowledge graph entities"""

    entities = []

    # Create document entity
    doc_entity = {
        "type": "Document",
        "id": doc_id,
        "properties": {
            "doc_id": doc_id,
            "tenant_id": tenant_id,
            "processed_at": datetime.utcnow().isoformat(),
            "source": "extraction",
            "extractor_version": "1.0.0",
            "valid_from": datetime.utcnow(),
            "asserted_at": datetime.utcnow(),
        },
    }
    entities.append(doc_entity)

    # Map specific field types to entities
    for field_name, normalized_value in normalized_data.items():
        if isinstance(normalized_value, dict):
            if "amount" in normalized_value and normalized_value["amount"] is not None:
                # Create expense or income item
                entity_type = (
                    "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
                )
                entity = {
                    "type": entity_type,
                    "id": f"{entity_type.lower()}_{ulid.new()}",
                    "properties": {
                        "amount": normalized_value["amount"],
                        "currency": normalized_value["currency"],
                        "description": field_name,
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

            elif "name" in normalized_value and normalized_value["name"] is not None:
                # Create party entity
                entity = {
                    "type": "Party",
                    "id": f"party_{ulid.new()}",
                    "properties": {
                        "name": normalized_value["name"],
                        "party_type": normalized_value.get("type", "unknown"),
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

    return entities


async def _store_entities(
    entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
    """Store entities in knowledge graph"""

    stored_entities = []

    for entity in entities:
        try:
            # Create node in Neo4j
            result = await neo4j_client.create_node(
                label=entity["type"], properties=entity["properties"]
            )

            stored_entities.append(
                {
                    "type": entity["type"],
                    "id": entity["id"],
                    "neo4j_id": result.get("id"),
                    "properties": entity["properties"],
                }
            )

            logger.debug("Entity stored", type=entity["type"], id=entity["id"])

        except Exception as e:
            logger.error("Failed to store entity", entity=entity, error=str(e))

    return stored_entities


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
@@ -579,8 +330,8 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONRe
        status=exc.status_code,
        detail=exc.detail,
        instance=str(request.url),
        trace_id="",
    ).dict(),
        trace_id=getattr(request.state, "trace_id", None),
    ).model_dump(),
    )
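As a quick illustration of the reworked _normalize_data contract, a standalone sketch (assumes the service module imports cleanly from the repo root; field names and values are invented):

import asyncio
from apps.svc_normalize_map.main import _normalize_data

fields = {"invoice_date": "2024-05-01", "total_amount": "£1,250.00", "supplier_name": "Acme Ltd"}
print(asyncio.run(_normalize_data(fields)))
# -> {'invoice_date': '2024-05-01', 'total_amount': 1250.0, 'supplier_name': 'Acme Ltd'}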
@@ -1,37 +1 @@
# FastAPI and server
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0

# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.3.3
numpy>=2.3.3

# Currency and exchange rates
forex-python>=1.9.2
babel>=2.17.0

# Date and time processing
python-dateutil>=2.9.0
pytz>=2025.2

# Text normalization
unidecode>=1.4.0
phonenumbers>=9.0.16

# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.27.1

# Geographic data
geopy>=2.4.1
pycountry>=24.6.1

# Data validation
cerberus>=1.3.7
marshmallow>=4.0.1

# UK-specific utilities
uk-postcode-utils>=1.1
python-ulid