"""Data normalization and knowledge graph mapping.""" # FILE: apps/svc-normalize-map/main.py # pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement # pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument # pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments # pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements # mypy: disable-error-code=union-attr import os # Import shared libraries import sys from datetime import datetime from decimal import Decimal from typing import Any import structlog import ulid from fastapi import BackgroundTasks, Depends, HTTPException, Request from fastapi.responses import JSONResponse sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) from libs.app_factory import create_app from libs.config import ( BaseAppSettings, create_event_bus, create_minio_client, create_neo4j_client, ) from libs.events import EventBus, EventPayload, EventTopics from libs.neo import Neo4jClient from libs.observability import get_metrics, get_tracer, setup_observability from libs.schemas import ErrorResponse from libs.security import get_current_user, get_tenant_id from libs.storage import DocumentStorage, StorageClient logger = structlog.get_logger() class NormalizeMapSettings(BaseAppSettings): """Settings for normalize-map service""" service_name: str = "svc-normalize-map" # Normalization configuration currency_default: str = "GBP" date_formats: list[str] = [ "%Y-%m-%d", "%d/%m/%Y", "%d-%m-%Y", "%d %B %Y", "%d %b %Y", "%B %d, %Y", ] # Mapping configuration confidence_threshold: float = 0.7 auto_create_entities: bool = True # Validation rules max_amount: float = 1000000.0 # £1M min_confidence: float = 0.5 # Create app and settings app, settings = create_app( service_name="svc-normalize-map", title="Tax Agent Normalize-Map Service", description="Data normalization and knowledge graph mapping service", settings_class=NormalizeMapSettings, ) # Global clients storage_client: StorageClient | None = None document_storage: DocumentStorage | None = None neo4j_client: Neo4jClient | None = None event_bus: EventBus | None = None tracer = get_tracer("svc-normalize-map") metrics = get_metrics() @app.on_event("startup") async def startup_event() -> None: """Initialize service dependencies""" global storage_client, document_storage, neo4j_client, event_bus logger.info("Starting normalize-map service") # Setup observability setup_observability(settings) # Initialize MinIO client minio_client = create_minio_client(settings) storage_client = StorageClient(minio_client) document_storage = DocumentStorage(storage_client) # Initialize Neo4j client neo4j_driver = create_neo4j_client(settings) neo4j_client = Neo4jClient(neo4j_driver) # Initialize event bus event_bus = create_event_bus(settings) await event_bus.start() # Subscribe to extraction completion events await event_bus.subscribe( # type: ignore EventTopics.DOC_EXTRACTED, _handle_extraction_completed ) logger.info("Normalize-map service started successfully") @app.on_event("shutdown") async def shutdown_event() -> None: """Cleanup service dependencies""" global event_bus, neo4j_client logger.info("Shutting down normalize-map service") if neo4j_client: await neo4j_client.close() if event_bus: await event_bus.stop() logger.info("Normalize-map service shutdown complete") @app.get("/health") async def health_check() -> dict[str, Any]: """Health check endpoint""" return { "status": "healthy", "service": settings.service_name, "version": settings.service_version, "timestamp": datetime.utcnow().isoformat(), } @app.post("/normalize/{doc_id}") async def normalize_document( doc_id: str, background_tasks: BackgroundTasks, current_user: dict[str, Any] = Depends(get_current_user), tenant_id: str = Depends(get_tenant_id), ) -> dict[str, Any]: """Normalize and map document data to knowledge graph""" with tracer.start_as_current_span("normalize_document") as span: span.set_attribute("doc_id", doc_id) span.set_attribute("tenant_id", tenant_id) try: # Check if extraction results exist extraction_results = await document_storage.get_extraction_result( tenant_id, doc_id ) if not extraction_results: raise HTTPException( status_code=404, detail="Extraction results not found" ) # Generate normalization ID normalization_id = str(ulid.new()) span.set_attribute("normalization_id", normalization_id) # Start background normalization background_tasks.add_task( _normalize_and_map_async, doc_id, tenant_id, extraction_results, normalization_id, current_user.get("sub", "system"), ) logger.info( "Normalization started", doc_id=doc_id, normalization_id=normalization_id, ) return { "normalization_id": normalization_id, "doc_id": doc_id, "status": "processing", } except HTTPException: raise except Exception as e: logger.error("Failed to start normalization", doc_id=doc_id, error=str(e)) raise HTTPException(status_code=500, detail="Failed to start normalization") async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None: """Handle extraction completion events""" try: data = payload.data doc_id = data.get("doc_id") tenant_id = data.get("tenant_id") confidence = data.get("confidence", 0.0) if not doc_id or not tenant_id: logger.warning("Invalid extraction completion event", data=data) return # Only auto-process if confidence is above threshold if confidence >= settings.confidence_threshold: logger.info( "Auto-normalizing extracted document", doc_id=doc_id, confidence=confidence, ) extraction_results = data.get("extraction_results") if not extraction_results: extraction_results = await document_storage.get_extraction_result( tenant_id, doc_id ) if extraction_results: await _normalize_and_map_async( doc_id=doc_id, tenant_id=tenant_id, extraction_results=extraction_results, normalization_id=str(ulid.new()), actor=payload.actor, ) else: logger.info( "Skipping auto-normalization due to low confidence", doc_id=doc_id, confidence=confidence, ) except Exception as e: logger.error("Failed to handle extraction completion", error=str(e)) async def _normalize_and_map_async( doc_id: str, tenant_id: str, extraction_results: dict[str, Any], normalization_id: str, actor: str, ) -> None: """Normalize and map data asynchronously""" with tracer.start_as_current_span("normalize_and_map_async") as span: span.set_attribute("doc_id", doc_id) span.set_attribute("normalization_id", normalization_id) try: extracted_fields = extraction_results.get("extracted_fields", {}) provenance = extraction_results.get("provenance", []) # Normalize extracted data normalized_data = await _normalize_data(extracted_fields, provenance) # Map to knowledge graph entities entities = await _map_to_entities(normalized_data, doc_id, tenant_id) # Store entities in knowledge graph stored_entities = await _store_entities(entities, tenant_id) # Create normalization results normalization_results = { "doc_id": doc_id, "normalization_id": normalization_id, "normalized_at": datetime.utcnow().isoformat(), "normalized_data": normalized_data, "entities": stored_entities, "entity_count": len(stored_entities), } logger.info("Normalization completed", results=normalization_results) # Update metrics metrics.counter("documents_normalized_total").labels( tenant_id=tenant_id ).inc() metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe( len(stored_entities) ) # Publish completion event event_payload = EventPayload( data={ "doc_id": doc_id, "tenant_id": tenant_id, "normalization_id": normalization_id, "entity_count": len(stored_entities), "entities": stored_entities, }, actor=actor, tenant_id=tenant_id, ) await event_bus.publish(EventTopics.KG_UPSERTED, event_payload) logger.info( "Normalization completed", doc_id=doc_id, entities=len(stored_entities) ) except Exception as e: logger.error("Normalization failed", doc_id=doc_id, error=str(e)) # Update error metrics metrics.counter("normalization_errors_total").labels( tenant_id=tenant_id, error_type=type(e).__name__ ).inc() async def _normalize_data( extracted_fields: dict[str, Any], provenance: list[dict[str, Any]] ) -> dict[str, Any]: """Normalize extracted data""" normalized = {} for field_name, raw_value in extracted_fields.items(): try: if "amount" in field_name.lower() or "total" in field_name.lower(): normalized[field_name] = _normalize_amount(raw_value) elif "date" in field_name.lower(): normalized[field_name] = _normalize_date(raw_value) elif "name" in field_name.lower(): normalized[field_name] = _normalize_name(raw_value) elif "address" in field_name.lower(): normalized[field_name] = _normalize_address(raw_value) elif "number" in field_name.lower(): normalized[field_name] = _normalize_number(raw_value) else: normalized[field_name] = _normalize_text(raw_value) except Exception as e: logger.warning( "Failed to normalize field", field=field_name, value=raw_value, error=str(e), ) normalized[field_name] = raw_value # Keep original value return normalized def _normalize_amount(value: str) -> dict[str, Any]: """Normalize monetary amount""" import re if not value: return {"amount": None, "currency": settings.currency_default} # Remove currency symbols and formatting clean_value = re.sub(r"[£$€,\s]", "", str(value)) try: amount = Decimal(clean_value) # Validate amount if amount > settings.max_amount: logger.warning("Amount exceeds maximum", amount=amount) return { "amount": float(amount), "currency": settings.currency_default, "original": value, } except Exception: return { "amount": None, "currency": settings.currency_default, "original": value, } def _normalize_date(value: str) -> dict[str, Any]: """Normalize date""" from dateutil import parser if not value: return {"date": None, "original": value} try: # Try parsing with dateutil first parsed_date = parser.parse(str(value), dayfirst=True) return {"date": parsed_date.date().isoformat(), "original": value} except Exception: # Try manual formats for fmt in settings.date_formats: try: parsed_date = datetime.strptime(str(value), fmt) return {"date": parsed_date.date().isoformat(), "original": value} except Exception: continue return {"date": None, "original": value} def _normalize_name(value: str) -> dict[str, Any]: """Normalize person/company name""" if not value: return {"name": None, "original": value} # Clean and title case clean_name = str(value).strip().title() # Detect if it's a company (contains Ltd, Limited, etc.) company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"] is_company = any(indicator in clean_name for indicator in company_indicators) return { "name": clean_name, "type": "company" if is_company else "person", "original": value, } def _normalize_address(value: str) -> dict[str, Any]: """Normalize address""" import re if not value: return {"address": None, "original": value} clean_address = str(value).strip() # Extract UK postcode postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b" postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE) postcode = postcode_match.group().upper() if postcode_match else None return {"address": clean_address, "postcode": postcode, "original": value} def _normalize_number(value: str) -> dict[str, Any]: """Normalize reference numbers""" import re if not value: return {"number": None, "original": value} # Remove spaces and special characters clean_number = re.sub(r"[^\w]", "", str(value)) # Detect number type number_type = "unknown" if len(clean_number) == 10 and clean_number.isdigit(): number_type = "utr" # UTR is 10 digits elif len(clean_number) == 8 and clean_number.isdigit(): number_type = "account_number" elif re.match(r"^\d{6}$", clean_number): number_type = "sort_code" return {"number": clean_number, "type": number_type, "original": value} def _normalize_text(value: str) -> dict[str, Any]: """Normalize general text""" if not value: return {"text": None, "original": value} clean_text = str(value).strip() return {"text": clean_text, "original": value} async def _map_to_entities( normalized_data: dict[str, Any], doc_id: str, tenant_id: str ) -> list[dict[str, Any]]: """Map normalized data to knowledge graph entities""" entities = [] # Create document entity doc_entity = { "type": "Document", "id": doc_id, "properties": { "doc_id": doc_id, "tenant_id": tenant_id, "processed_at": datetime.utcnow().isoformat(), "source": "extraction", "extractor_version": "1.0.0", "valid_from": datetime.utcnow(), "asserted_at": datetime.utcnow(), }, } entities.append(doc_entity) # Map specific field types to entities for field_name, normalized_value in normalized_data.items(): if isinstance(normalized_value, dict): if "amount" in normalized_value and normalized_value["amount"] is not None: # Create expense or income item entity_type = ( "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem" ) entity = { "type": entity_type, "id": f"{entity_type.lower()}_{ulid.new()}", "properties": { "amount": normalized_value["amount"], "currency": normalized_value["currency"], "description": field_name, "source": doc_id, "extractor_version": "1.0.0", "valid_from": datetime.utcnow(), "asserted_at": datetime.utcnow(), }, } entities.append(entity) elif "name" in normalized_value and normalized_value["name"] is not None: # Create party entity entity = { "type": "Party", "id": f"party_{ulid.new()}", "properties": { "name": normalized_value["name"], "party_type": normalized_value.get("type", "unknown"), "source": doc_id, "extractor_version": "1.0.0", "valid_from": datetime.utcnow(), "asserted_at": datetime.utcnow(), }, } entities.append(entity) return entities async def _store_entities( entities: list[dict[str, Any]], tenant_id: str ) -> list[dict[str, Any]]: """Store entities in knowledge graph""" stored_entities = [] for entity in entities: try: # Create node in Neo4j result = await neo4j_client.create_node( label=entity["type"], properties=entity["properties"] ) stored_entities.append( { "type": entity["type"], "id": entity["id"], "neo4j_id": result.get("id"), "properties": entity["properties"], } ) logger.debug("Entity stored", type=entity["type"], id=entity["id"]) except Exception as e: logger.error("Failed to store entity", entity=entity, error=str(e)) return stored_entities @app.exception_handler(HTTPException) async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse: """Handle HTTP exceptions with RFC7807 format""" return JSONResponse( status_code=exc.status_code, content=ErrorResponse( type=f"https://httpstatuses.com/{exc.status_code}", title=exc.detail, status=exc.status_code, detail=exc.detail, instance=str(request.url), trace_id="", ).dict(), ) if __name__ == "__main__": import uvicorn uvicorn.run("main:app", host="0.0.0.0", port=8004, reload=True, log_config=None)