"""Data normalization and knowledge graph mapping."""
# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_minio_client,
create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()

class NormalizeMapSettings(BaseAppSettings):
    """Settings for normalize-map service"""

    service_name: str = "svc-normalize-map"

    # Normalization configuration
    currency_default: str = "GBP"
    date_formats: list[str] = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%d %B %Y",
        "%d %b %Y",
        "%B %d, %Y",
    ]

    # Mapping configuration
    confidence_threshold: float = 0.7
    auto_create_entities: bool = True

    # Validation rules
    max_amount: float = 1000000.0  # £1M
    min_confidence: float = 0.5
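
# NOTE: assuming BaseAppSettings follows the usual pydantic BaseSettings
# convention, each field above can be overridden via an environment variable
# at deploy time, e.g. (illustrative):
#
#   CONFIDENCE_THRESHOLD=0.85 CURRENCY_DEFAULT=GBP uvicorn main:app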

# Create app and settings
app, settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize-Map Service",
    description="Data normalization and knowledge graph mapping service",
    settings_class=NormalizeMapSettings,
)

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None

tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, event_bus
logger.info("Starting normalize-map service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
# Subscribe to extraction completion events
await event_bus.subscribe( # type: ignore
EventTopics.DOC_EXTRACTED, _handle_extraction_completed
)
logger.info("Normalize-map service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, neo4j_client
logger.info("Shutting down normalize-map service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Normalize-map service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
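
# Illustrative check (the port assumes the uvicorn defaults at the bottom of
# this file; the response body is a sketch):
#
#   curl http://localhost:8004/health
#   {"status": "healthy", "service": "svc-normalize-map", ...}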
@app.post("/normalize/{doc_id}")
async def normalize_document(
doc_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Normalize and map document data to knowledge graph"""
with tracer.start_as_current_span("normalize_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if extraction results exist
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# Generate normalization ID
normalization_id = str(ulid.new())
span.set_attribute("normalization_id", normalization_id)
# Start background normalization
background_tasks.add_task(
_normalize_and_map_async,
doc_id,
tenant_id,
extraction_results,
normalization_id,
current_user.get("sub", "system"),
)
logger.info(
"Normalization started",
doc_id=doc_id,
normalization_id=normalization_id,
)
return {
"normalization_id": normalization_id,
"doc_id": doc_id,
"status": "processing",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start normalization")
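
# Illustrative call (the doc ID and bearer token here are hypothetical;
# authentication is resolved by get_current_user / get_tenant_id):
#
#   curl -X POST http://localhost:8004/normalize/01HZX... \
#        -H "Authorization: Bearer <token>"
#   {"normalization_id": "01J...", "doc_id": "01HZX...", "status": "processing"}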

async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
    """Handle extraction completion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")
        confidence = data.get("confidence", 0.0)

        if not doc_id or not tenant_id:
            logger.warning("Invalid extraction completion event", data=data)
            return

        # Only auto-process if confidence is above threshold
        if confidence >= settings.confidence_threshold:
            logger.info(
                "Auto-normalizing extracted document",
                doc_id=doc_id,
                confidence=confidence,
            )
            extraction_results = data.get("extraction_results")
            if not extraction_results:
                extraction_results = await document_storage.get_extraction_result(
                    tenant_id, doc_id
                )
            if extraction_results:
                await _normalize_and_map_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    extraction_results=extraction_results,
                    normalization_id=str(ulid.new()),
                    actor=payload.actor,
                )
        else:
            logger.info(
                "Skipping auto-normalization due to low confidence",
                doc_id=doc_id,
                confidence=confidence,
            )
    except Exception as e:
        logger.error("Failed to handle extraction completion", error=str(e))

async def _normalize_and_map_async(
    doc_id: str,
    tenant_id: str,
    extraction_results: dict[str, Any],
    normalization_id: str,
    actor: str,
) -> None:
    """Normalize and map data asynchronously"""
    with tracer.start_as_current_span("normalize_and_map_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("normalization_id", normalization_id)

        try:
            extracted_fields = extraction_results.get("extracted_fields", {})
            provenance = extraction_results.get("provenance", [])

            # Normalize extracted data
            normalized_data = await _normalize_data(extracted_fields, provenance)

            # Map to knowledge graph entities
            entities = await _map_to_entities(normalized_data, doc_id, tenant_id)

            # Store entities in knowledge graph
            stored_entities = await _store_entities(entities, tenant_id)

            # Create normalization results
            normalization_results = {
                "doc_id": doc_id,
                "normalization_id": normalization_id,
                "normalized_at": datetime.utcnow().isoformat(),
                "normalized_data": normalized_data,
                "entities": stored_entities,
                "entity_count": len(stored_entities),
            }

            # Update metrics
            metrics.counter("documents_normalized_total").labels(
                tenant_id=tenant_id
            ).inc()
            metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
                len(stored_entities)
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "normalization_id": normalization_id,
                    "entity_count": len(stored_entities),
                    "entities": stored_entities,
                },
                actor=actor,
                tenant_id=tenant_id,
            )
            await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)

            # Log once, after publishing (was previously duplicated)
            logger.info("Normalization completed", results=normalization_results)
        except Exception as e:
            logger.error("Normalization failed", doc_id=doc_id, error=str(e))
            # Update error metrics
            metrics.counter("normalization_errors_total").labels(
                tenant_id=tenant_id, error_type=type(e).__name__
            ).inc()

async def _normalize_data(
    extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
    """Normalize extracted data"""
    normalized = {}
    for field_name, raw_value in extracted_fields.items():
        try:
            if "amount" in field_name.lower() or "total" in field_name.lower():
                normalized[field_name] = _normalize_amount(raw_value)
            elif "date" in field_name.lower():
                normalized[field_name] = _normalize_date(raw_value)
            elif "name" in field_name.lower():
                normalized[field_name] = _normalize_name(raw_value)
            elif "address" in field_name.lower():
                normalized[field_name] = _normalize_address(raw_value)
            elif "number" in field_name.lower():
                normalized[field_name] = _normalize_number(raw_value)
            else:
                normalized[field_name] = _normalize_text(raw_value)
        except Exception as e:
            logger.warning(
                "Failed to normalize field",
                field=field_name,
                value=raw_value,
                error=str(e),
            )
            normalized[field_name] = raw_value  # Keep original value
    return normalized

def _normalize_amount(value: str) -> dict[str, Any]:
    """Normalize monetary amount"""
    import re

    if not value:
        return {"amount": None, "currency": settings.currency_default}

    # Remove currency symbols and formatting
    clean_value = re.sub(r"[£$€,\s]", "", str(value))
    try:
        amount = Decimal(clean_value)
        # Validate amount (warn but still return the parsed value)
        if amount > settings.max_amount:
            logger.warning("Amount exceeds maximum", amount=amount)
        return {
            "amount": float(amount),
            "currency": settings.currency_default,
            "original": value,
        }
    except Exception:
        return {
            "amount": None,
            "currency": settings.currency_default,
            "original": value,
        }
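
# Illustrative behaviour (with the default settings above):
#
#   _normalize_amount("£1,234.50")
#   -> {"amount": 1234.5, "currency": "GBP", "original": "£1,234.50"}
#   _normalize_amount("not a number")
#   -> {"amount": None, "currency": "GBP", "original": "not a number"}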

def _normalize_date(value: str) -> dict[str, Any]:
    """Normalize date"""
    from dateutil import parser

    if not value:
        return {"date": None, "original": value}

    try:
        # Try parsing with dateutil first
        parsed_date = parser.parse(str(value), dayfirst=True)
        return {"date": parsed_date.date().isoformat(), "original": value}
    except Exception:
        # Try manual formats
        for fmt in settings.date_formats:
            try:
                parsed_date = datetime.strptime(str(value), fmt)
                return {"date": parsed_date.date().isoformat(), "original": value}
            except Exception:
                continue
    return {"date": None, "original": value}

def _normalize_name(value: str) -> dict[str, Any]:
    """Normalize person/company name"""
    if not value:
        return {"name": None, "original": value}

    # Clean and title case
    clean_name = str(value).strip().title()

    # Detect if it's a company (contains Ltd, Limited, etc.)
    company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
    is_company = any(indicator in clean_name for indicator in company_indicators)

    return {
        "name": clean_name,
        "type": "company" if is_company else "person",
        "original": value,
    }
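
# Illustrative behaviour:
#
#   _normalize_name("ACME TRADING LTD")
#   -> {"name": "Acme Trading Ltd", "type": "company", "original": "ACME TRADING LTD"}
#   _normalize_name("jane smith")
#   -> {"name": "Jane Smith", "type": "person", "original": "jane smith"}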

def _normalize_address(value: str) -> dict[str, Any]:
    """Normalize address"""
    import re

    if not value:
        return {"address": None, "original": value}

    clean_address = str(value).strip()

    # Extract UK postcode
    postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
    postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
    postcode = postcode_match.group().upper() if postcode_match else None

    return {"address": clean_address, "postcode": postcode, "original": value}

def _normalize_number(value: str) -> dict[str, Any]:
    """Normalize reference numbers"""
    import re

    if not value:
        return {"number": None, "original": value}

    # Remove spaces and special characters
    clean_number = re.sub(r"[^\w]", "", str(value))

    # Detect number type
    number_type = "unknown"
    if len(clean_number) == 10 and clean_number.isdigit():
        number_type = "utr"  # UTR is 10 digits
    elif len(clean_number) == 8 and clean_number.isdigit():
        number_type = "account_number"
    elif re.match(r"^\d{6}$", clean_number):
        number_type = "sort_code"

    return {"number": clean_number, "type": number_type, "original": value}

def _normalize_text(value: str) -> dict[str, Any]:
    """Normalize general text"""
    if not value:
        return {"text": None, "original": value}
    clean_text = str(value).strip()
    return {"text": clean_text, "original": value}

async def _map_to_entities(
    normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
    """Map normalized data to knowledge graph entities"""
    entities = []

    # Create document entity
    doc_entity = {
        "type": "Document",
        "id": doc_id,
        "properties": {
            "doc_id": doc_id,
            "tenant_id": tenant_id,
            "processed_at": datetime.utcnow().isoformat(),
            "source": "extraction",
            "extractor_version": "1.0.0",
            "valid_from": datetime.utcnow(),
            "asserted_at": datetime.utcnow(),
        },
    }
    entities.append(doc_entity)

    # Map specific field types to entities
    for field_name, normalized_value in normalized_data.items():
        if isinstance(normalized_value, dict):
            if "amount" in normalized_value and normalized_value["amount"] is not None:
                # Create expense or income item
                entity_type = (
                    "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
                )
                entity = {
                    "type": entity_type,
                    "id": f"{entity_type.lower()}_{ulid.new()}",
                    "properties": {
                        "amount": normalized_value["amount"],
                        "currency": normalized_value["currency"],
                        "description": field_name,
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)
            elif "name" in normalized_value and normalized_value["name"] is not None:
                # Create party entity
                entity = {
                    "type": "Party",
                    "id": f"party_{ulid.new()}",
                    "properties": {
                        "name": normalized_value["name"],
                        "party_type": normalized_value.get("type", "unknown"),
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

    return entities
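
# Sketch of the output for one normalized amount field (entity IDs are
# freshly minted ULIDs, so the values here are illustrative):
#
#   [
#       {"type": "Document", "id": "<doc_id>", "properties": {...}},
#       {"type": "IncomeItem", "id": "incomeitem_01J...",
#        "properties": {"amount": 1234.5, "currency": "GBP", ...}},
#   ]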

async def _store_entities(
    entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
    """Store entities in knowledge graph"""
    stored_entities = []
    for entity in entities:
        try:
            # Create node in Neo4j
            result = await neo4j_client.create_node(
                label=entity["type"], properties=entity["properties"]
            )
            stored_entities.append(
                {
                    "type": entity["type"],
                    "id": entity["id"],
                    "neo4j_id": result.get("id"),
                    "properties": entity["properties"],
                }
            )
            logger.debug("Entity stored", type=entity["type"], id=entity["id"])
        except Exception as e:
            logger.error("Failed to store entity", entity=entity, error=str(e))
    return stored_entities

@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).dict(),
    )
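
# Illustrative RFC 7807 body produced for a 404 from /normalize/{doc_id}
# (instance URL is hypothetical):
#
#   {
#       "type": "https://httpstatuses.com/404",
#       "title": "Extraction results not found",
#       "status": 404,
#       "detail": "Extraction results not found",
#       "instance": "http://localhost:8004/normalize/01HZX...",
#       "trace_id": ""
#   }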
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8004, reload=True, log_config=None)