"""Data normalization and knowledge graph mapping."""
|
|
|
|
# FILE: apps/svc-normalize-map/main.py
|
|
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
|
|
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
|
|
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
|
|
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
|
|
# mypy: disable-error-code=union-attr
|
|
|
|
|
|
import os
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse

# Import shared libraries
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import (
    BaseAppSettings,
    create_event_bus,
    create_minio_client,
    create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()

class NormalizeMapSettings(BaseAppSettings):
    """Settings for normalize-map service"""

    service_name: str = "svc-normalize-map"

    # Normalization configuration
    currency_default: str = "GBP"
    date_formats: list[str] = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%d %B %Y",
        "%d %b %Y",
        "%B %d, %Y",
    ]

    # Mapping configuration
    confidence_threshold: float = 0.7
    auto_create_entities: bool = True

    # Validation rules
    max_amount: float = 1000000.0  # £1M
    min_confidence: float = 0.5

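# Configuration note: assuming BaseAppSettings follows the usual pydantic
# BaseSettings conventions, each default above can be overridden through the
# environment at deploy time (e.g. CONFIDENCE_THRESHOLD=0.8 to tighten
# auto-normalization, or CURRENCY_DEFAULT=EUR for non-GBP tenants).
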
# Create app and settings
app, settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize-Map Service",
    description="Data normalization and knowledge graph mapping service",
    settings_class=NormalizeMapSettings,
)

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()

@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, neo4j_client, event_bus

    logger.info("Starting normalize-map service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize Neo4j client
    neo4j_driver = create_neo4j_client(settings)
    neo4j_client = Neo4jClient(neo4j_driver)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    await event_bus.start()

    # Subscribe to extraction completion events
    await event_bus.subscribe(  # type: ignore
        EventTopics.DOC_EXTRACTED, _handle_extraction_completed
    )

    logger.info("Normalize-map service started successfully")

@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global event_bus, neo4j_client

    logger.info("Shutting down normalize-map service")

    if neo4j_client:
        await neo4j_client.close()

    if event_bus:
        await event_bus.stop()

    logger.info("Normalize-map service shutdown complete")

@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": datetime.utcnow().isoformat(),
    }

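# Example request (hypothetical IDs; assumes bearer-token auth satisfies
# get_current_user, and the default port 8004 from the __main__ block below):
#   curl -X POST http://localhost:8004/normalize/01HZEXAMPLEDOCID \
#        -H "Authorization: Bearer <token>"
# The endpoint returns {"normalization_id": ..., "status": "processing"}
# immediately; the actual work runs in _normalize_and_map_async.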
@app.post("/normalize/{doc_id}")
async def normalize_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Normalize and map document data to knowledge graph"""

    with tracer.start_as_current_span("normalize_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Check if extraction results exist
            extraction_results = await document_storage.get_extraction_result(
                tenant_id, doc_id
            )
            if not extraction_results:
                raise HTTPException(
                    status_code=404, detail="Extraction results not found"
                )

            # Generate normalization ID
            normalization_id = str(ulid.new())
            span.set_attribute("normalization_id", normalization_id)

            # Start background normalization
            background_tasks.add_task(
                _normalize_and_map_async,
                doc_id,
                tenant_id,
                extraction_results,
                normalization_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "Normalization started",
                doc_id=doc_id,
                normalization_id=normalization_id,
            )

            return {
                "normalization_id": normalization_id,
                "doc_id": doc_id,
                "status": "processing",
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to start normalization")

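# Event-driven entry point: DOC_EXTRACTED events (presumably published by the
# extraction service) trigger automatic normalization whenever the reported
# confidence clears settings.confidence_threshold, so most documents never
# need the manual POST /normalize call above.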
async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
    """Handle extraction completion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")
        confidence = data.get("confidence", 0.0)

        if not doc_id or not tenant_id:
            logger.warning("Invalid extraction completion event", data=data)
            return

        # Only auto-process if confidence is above threshold
        if confidence >= settings.confidence_threshold:
            logger.info(
                "Auto-normalizing extracted document",
                doc_id=doc_id,
                confidence=confidence,
            )

            extraction_results = data.get("extraction_results")
            if not extraction_results:
                extraction_results = await document_storage.get_extraction_result(
                    tenant_id, doc_id
                )

            if extraction_results:
                await _normalize_and_map_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    extraction_results=extraction_results,
                    normalization_id=str(ulid.new()),
                    actor=payload.actor,
                )
        else:
            logger.info(
                "Skipping auto-normalization due to low confidence",
                doc_id=doc_id,
                confidence=confidence,
            )

    except Exception as e:
        logger.error("Failed to handle extraction completion", error=str(e))

async def _normalize_and_map_async(
    doc_id: str,
    tenant_id: str,
    extraction_results: dict[str, Any],
    normalization_id: str,
    actor: str,
) -> None:
    """Normalize and map data asynchronously"""

    with tracer.start_as_current_span("normalize_and_map_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("normalization_id", normalization_id)

        try:
            extracted_fields = extraction_results.get("extracted_fields", {})
            provenance = extraction_results.get("provenance", [])

            # Normalize extracted data
            normalized_data = await _normalize_data(extracted_fields, provenance)

            # Map to knowledge graph entities
            entities = await _map_to_entities(normalized_data, doc_id, tenant_id)

            # Store entities in knowledge graph
            stored_entities = await _store_entities(entities, tenant_id)

            # Create normalization results
            normalization_results = {
                "doc_id": doc_id,
                "normalization_id": normalization_id,
                "normalized_at": datetime.utcnow().isoformat(),
                "normalized_data": normalized_data,
                "entities": stored_entities,
                "entity_count": len(stored_entities),
            }

            logger.info("Normalization completed", results=normalization_results)

            # Update metrics
            metrics.counter("documents_normalized_total").labels(
                tenant_id=tenant_id
            ).inc()
            metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
                len(stored_entities)
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "normalization_id": normalization_id,
                    "entity_count": len(stored_entities),
                    "entities": stored_entities,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)

            logger.info(
                "Normalization completed", doc_id=doc_id, entities=len(stored_entities)
            )

        except Exception as e:
            logger.error("Normalization failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("normalization_errors_total").labels(
                tenant_id=tenant_id, error_type=type(e).__name__
            ).inc()

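# Normalization dispatch is driven by field-name heuristics: "amount"/"total"
# fields become money values, "date" fields ISO dates, "name" fields typed
# party names, "address" fields get a postcode extracted, "number" fields are
# classified as UK reference numbers, and everything else is cleaned as text.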
async def _normalize_data(
    extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
    """Normalize extracted data"""

    normalized = {}

    for field_name, raw_value in extracted_fields.items():
        try:
            if "amount" in field_name.lower() or "total" in field_name.lower():
                normalized[field_name] = _normalize_amount(raw_value)
            elif "date" in field_name.lower():
                normalized[field_name] = _normalize_date(raw_value)
            elif "name" in field_name.lower():
                normalized[field_name] = _normalize_name(raw_value)
            elif "address" in field_name.lower():
                normalized[field_name] = _normalize_address(raw_value)
            elif "number" in field_name.lower():
                normalized[field_name] = _normalize_number(raw_value)
            else:
                normalized[field_name] = _normalize_text(raw_value)

        except Exception as e:
            logger.warning(
                "Failed to normalize field",
                field=field_name,
                value=raw_value,
                error=str(e),
            )
            normalized[field_name] = raw_value  # Keep original value

    return normalized

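# Example: _normalize_amount("£1,234.56")
#   -> {"amount": 1234.56, "currency": "GBP", "original": "£1,234.56"}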
def _normalize_amount(value: str) -> dict[str, Any]:
    """Normalize monetary amount"""
    import re

    if not value:
        return {"amount": None, "currency": settings.currency_default}

    # Remove currency symbols and formatting
    clean_value = re.sub(r"[£$€,\s]", "", str(value))

    try:
        amount = Decimal(clean_value)

        # Validate amount
        if amount > settings.max_amount:
            logger.warning("Amount exceeds maximum", amount=amount)

        return {
            "amount": float(amount),
            "currency": settings.currency_default,
            "original": value,
        }
    except Exception:
        return {
            "amount": None,
            "currency": settings.currency_default,
            "original": value,
        }

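# Example: _normalize_date("01/02/2024") -> {"date": "2024-02-01", ...};
# dayfirst=True means ambiguous dates are read UK-style (DD/MM/YYYY).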
def _normalize_date(value: str) -> dict[str, Any]:
    """Normalize date"""
    from dateutil import parser

    if not value:
        return {"date": None, "original": value}

    try:
        # Try parsing with dateutil first
        parsed_date = parser.parse(str(value), dayfirst=True)
        return {"date": parsed_date.date().isoformat(), "original": value}
    except Exception:
        # Try manual formats
        for fmt in settings.date_formats:
            try:
                parsed_date = datetime.strptime(str(value), fmt)
                return {"date": parsed_date.date().isoformat(), "original": value}
            except Exception:
                continue

        return {"date": None, "original": value}

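# Example: _normalize_name("ACME WIDGETS LTD")
#   -> {"name": "Acme Widgets Ltd", "type": "company", "original": "ACME WIDGETS LTD"}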
def _normalize_name(value: str) -> dict[str, Any]:
    """Normalize person/company name"""
    if not value:
        return {"name": None, "original": value}

    # Clean and title case
    clean_name = str(value).strip().title()

    # Detect if it's a company (contains Ltd, Limited, etc.)
    company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
    is_company = any(indicator in clean_name for indicator in company_indicators)

    return {
        "name": clean_name,
        "type": "company" if is_company else "person",
        "original": value,
    }

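# Example: _normalize_address("10 Downing Street, London sw1a 2aa") extracts
# postcode "SW1A 2AA" while leaving the address text otherwise untouched.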
def _normalize_address(value: str) -> dict[str, Any]:
    """Normalize address"""
    import re

    if not value:
        return {"address": None, "original": value}

    clean_address = str(value).strip()

    # Extract UK postcode
    postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
    postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
    postcode = postcode_match.group().upper() if postcode_match else None

    return {"address": clean_address, "postcode": postcode, "original": value}

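# UK reference-number conventions: an HMRC Unique Taxpayer Reference (UTR) is
# 10 digits, a bank account number 8 digits, and a sort code 6 digits.
# Example: _normalize_number("12-34-56")
#   -> {"number": "123456", "type": "sort_code", "original": "12-34-56"}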
def _normalize_number(value: str) -> dict[str, Any]:
    """Normalize reference numbers"""
    import re

    if not value:
        return {"number": None, "original": value}

    # Remove spaces and special characters
    clean_number = re.sub(r"[^\w]", "", str(value))

    # Detect number type
    number_type = "unknown"
    if len(clean_number) == 10 and clean_number.isdigit():
        number_type = "utr"  # UTR is 10 digits
    elif len(clean_number) == 8 and clean_number.isdigit():
        number_type = "account_number"
    elif re.match(r"^\d{6}$", clean_number):
        number_type = "sort_code"

    return {"number": clean_number, "type": number_type, "original": value}

def _normalize_text(value: str) -> dict[str, Any]:
    """Normalize general text"""
    if not value:
        return {"text": None, "original": value}

    clean_text = str(value).strip()

    return {"text": clean_text, "original": value}

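# Every entity carries bitemporal metadata: valid_from records when the fact
# holds in the real world, asserted_at when it entered the graph (both set to
# "now" here, since extraction is the first assertion of these facts).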
async def _map_to_entities(
    normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
    """Map normalized data to knowledge graph entities"""

    entities = []

    # Create document entity
    doc_entity = {
        "type": "Document",
        "id": doc_id,
        "properties": {
            "doc_id": doc_id,
            "tenant_id": tenant_id,
            "processed_at": datetime.utcnow().isoformat(),
            "source": "extraction",
            "extractor_version": "1.0.0",
            "valid_from": datetime.utcnow(),
            "asserted_at": datetime.utcnow(),
        },
    }
    entities.append(doc_entity)

    # Map specific field types to entities
    for field_name, normalized_value in normalized_data.items():
        if isinstance(normalized_value, dict):
            if "amount" in normalized_value and normalized_value["amount"] is not None:
                # Create expense or income item
                entity_type = (
                    "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
                )
                entity = {
                    "type": entity_type,
                    "id": f"{entity_type.lower()}_{ulid.new()}",
                    "properties": {
                        "amount": normalized_value["amount"],
                        "currency": normalized_value["currency"],
                        "description": field_name,
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

            elif "name" in normalized_value and normalized_value["name"] is not None:
                # Create party entity
                entity = {
                    "type": "Party",
                    "id": f"party_{ulid.new()}",
                    "properties": {
                        "name": normalized_value["name"],
                        "party_type": normalized_value.get("type", "unknown"),
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

    return entities

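# Writes are best-effort per entity: a failed Neo4j insert is logged and
# skipped, so the returned list can be shorter than the input.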
async def _store_entities(
    entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
    """Store entities in knowledge graph"""

    stored_entities = []

    for entity in entities:
        try:
            # Create node in Neo4j
            result = await neo4j_client.create_node(
                label=entity["type"], properties=entity["properties"]
            )

            stored_entities.append(
                {
                    "type": entity["type"],
                    "id": entity["id"],
                    "neo4j_id": result.get("id"),
                    "properties": entity["properties"],
                }
            )

            logger.debug("Entity stored", type=entity["type"], id=entity["id"])

        except Exception as e:
            logger.error("Failed to store entity", entity=entity, error=str(e))

    return stored_entities

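# RFC 7807 problem-details shape, e.g. for a missing document:
#   {"type": "https://httpstatuses.com/404", "title": "Extraction results not found",
#    "status": 404, "detail": "Extraction results not found",
#    "instance": "http://.../normalize/<doc_id>", "trace_id": ""}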
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).dict(),
    )

if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8004, reload=True, log_config=None)