completed local setup with compose
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
@@ -1,53 +1,27 @@
# Multi-stage build for svc_normalize_map
FROM python:3.12-slim AS builder
FROM python:3.12-slim-bookworm

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Set environment variables
ENV PYTHONUNBUFFERED 1
ENV APP_HOME /app

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Create and set working directory
WORKDIR $APP_HOME

# Copy requirements and install dependencies
# Install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Production stage
FROM python:3.12-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r appuser \
    && useradd -r -g appuser appuser

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set working directory
WORKDIR /app
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/

# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]
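The HEALTHCHECK above probes /healthz with curl inside the container. The same probe can be reproduced from the host once the service is up; a minimal sketch, assuming the container's port 8000 is published to localhost as in the EXPOSE/CMD lines (everything else is illustrative):

import urllib.request

def is_healthy(url: str = "http://localhost:8000/healthz", timeout: float = 10.0) -> bool:
    """Return True when the service answers the health probe with HTTP 200."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.status == 200
    except OSError:  # connection refused, timeout, or non-2xx HTTPError
        return False

if __name__ == "__main__":
    raise SystemExit(0 if is_healthy() else 1)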
@@ -1,24 +1,11 @@
"""Data normalization and knowledge graph mapping."""

# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr

import os

# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
from datetime import UTC, datetime
from typing import Any, cast

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -34,83 +21,68 @@ from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class NormalizeMapSettings(BaseAppSettings):
    """Settings for normalize-map service"""
    """Settings for NormalizeMap service"""

    service_name: str = "svc-normalize-map"

    # Normalization configuration
    currency_default: str = "GBP"
    date_formats: list[str] = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%d %B %Y",
        "%d %b %Y",
        "%B %d, %Y",
    ]

    # Mapping configuration
    confidence_threshold: float = 0.7
    auto_create_entities: bool = True

    # Validation rules
    max_amount: float = 1000000.0  # £1M
    min_confidence: float = 0.5


# Create app and settings
app, settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize-Map Service",
    description="Data normalization and knowledge graph mapping service",
    settings_class=NormalizeMapSettings,
)

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
neo4j_client: Neo4jClient | None = None

settings: NormalizeMapSettings


@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: NormalizeMapSettings) -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, neo4j_client, event_bus
    global storage_client, document_storage, event_bus, neo4j_client, settings

    logger.info("Starting normalize-map service")
    settings = app_settings
    logger.info("Starting NormalizeMap service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize Neo4j client
    neo4j_driver = create_neo4j_client(settings)
    neo4j_client = Neo4jClient(neo4j_driver)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        raise HTTPException(status_code=500, detail="Event bus not initialized")
    await event_bus.start()

    # Subscribe to extraction completion events
    await event_bus.subscribe(  # type: ignore
        EventTopics.DOC_EXTRACTED, _handle_extraction_completed
    )
    await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted)

    logger.info("Normalize-map service started successfully")
    logger.info("NormalizeMap service started successfully")


app, _settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize and Map Service",
    description="Normalize extracted data and map to Knowledge Graph",
    settings_class=NormalizeMapSettings,
)


# Initialize dependencies immediately
@app.on_event("startup")
async def startup_event():  # type: ignore
    await init_dependencies(cast(NormalizeMapSettings, _settings))


tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()


@app.on_event("shutdown")
@@ -118,456 +90,235 @@ async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global event_bus, neo4j_client

    logger.info("Shutting down normalize-map service")

    if neo4j_client:
        await neo4j_client.close()

    logger.info("Shutting down NormalizeMap service")
    if event_bus:
        await event_bus.stop()

    logger.info("Normalize-map service shutdown complete")
    if neo4j_client:
        await neo4j_client.close()
    logger.info("NormalizeMap service shutdown complete")


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": datetime.utcnow().isoformat(),
    }
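# Example /health response (values are illustrative):
#   {"status": "healthy", "service": "svc-normalize-map", "version": "1.0.0",
#    "timestamp": "2024-05-01T12:00:00"}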
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
    """Handle document extracted events"""
    data = payload.data
    doc_id = data.get("doc_id")
    tenant_id = data.get("tenant_id")
    extracted_fields = data.get("extraction_results", {}).get("extracted_fields", {})
    provenance = data.get("extraction_results", {}).get("provenance", [])

    if not doc_id or not tenant_id or not extracted_fields:
        logger.warning("Invalid document extracted event", data=data)
        return
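# Illustrative shape of the doc.extracted payload consumed above (the keys are
# the ones this handler reads; the concrete values are invented):
#   {
#       "doc_id": "01HZX0000000000000000000EX",
#       "tenant_id": "tenant-123",
#       "extraction_results": {
#           "extracted_fields": {"invoice_date": "2024-05-01", "total_amount": "£1,250.00"},
#           "provenance": [{"field": "total_amount", "page": 1, "bbox": [0, 0, 100, 40], "confidence": 0.93}],
#       },
#   }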
@app.post("/normalize/{doc_id}")
async def normalize_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Normalize and map document data to knowledge graph"""

    with tracer.start_as_current_span("normalize_document") as span:
    with tracer.start_as_current_span("normalize_and_map") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Check if extraction results exist
            extraction_results = await document_storage.get_extraction_result(
                tenant_id, doc_id
            )
            if not extraction_results:
                raise HTTPException(
                    status_code=404, detail="Extraction results not found"
                )
            # 1. Normalize data
            normalized_data = await _normalize_data(extracted_fields)

            # Generate normalization ID
            normalization_id = str(ulid.new())
            span.set_attribute("normalization_id", normalization_id)

            # Start background normalization
            background_tasks.add_task(
                _normalize_and_map_async,
                doc_id,
                tenant_id,
                extraction_results,
                normalization_id,
                current_user.get("sub", "system"),
            # 2. Map to KG ontology
            kg_upsert_payload = await _map_to_kg_ontology(
                doc_id, tenant_id, normalized_data, provenance
            )

            logger.info(
                "Normalization started",
                doc_id=doc_id,
                normalization_id=normalization_id,
            # 3. Publish kg.upsert.ready event
            event_payload = EventPayload(
                data=kg_upsert_payload,
                actor=payload.actor,
                tenant_id=tenant_id,
                trace_id=str(span.get_span_context().trace_id),
            )
            await event_bus.publish(EventTopics.KG_UPSERT_READY, event_payload)  # type: ignore

            return {
                "normalization_id": normalization_id,
                "doc_id": doc_id,
                "status": "processing",
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to start normalization")


async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
    """Handle extraction completion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")
        confidence = data.get("confidence", 0.0)

        if not doc_id or not tenant_id:
            logger.warning("Invalid extraction completion event", data=data)
            return

        # Only auto-process if confidence is above threshold
        if confidence >= settings.confidence_threshold:
            logger.info(
                "Auto-normalizing extracted document",
                doc_id=doc_id,
                confidence=confidence,
            )

            extraction_results = data.get("extraction_results")
            if not extraction_results:
                extraction_results = await document_storage.get_extraction_result(
                    tenant_id, doc_id
                )

            if extraction_results:
                await _normalize_and_map_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    extraction_results=extraction_results,
                    normalization_id=str(ulid.new()),
                    actor=payload.actor,
                )
        else:
            logger.info(
                "Skipping auto-normalization due to low confidence",
                doc_id=doc_id,
                confidence=confidence,
            )

    except Exception as e:
        logger.error("Failed to handle extraction completion", error=str(e))


async def _normalize_and_map_async(
    doc_id: str,
    tenant_id: str,
    extraction_results: dict[str, Any],
    normalization_id: str,
    actor: str,
) -> None:
    """Normalize and map data asynchronously"""

    with tracer.start_as_current_span("normalize_and_map_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("normalization_id", normalization_id)

        try:
            extracted_fields = extraction_results.get("extracted_fields", {})
            provenance = extraction_results.get("provenance", [])

            # Normalize extracted data
            normalized_data = await _normalize_data(extracted_fields, provenance)

            # Map to knowledge graph entities
            entities = await _map_to_entities(normalized_data, doc_id, tenant_id)

            # Store entities in knowledge graph
            stored_entities = await _store_entities(entities, tenant_id)

            # Create normalization results
            normalization_results = {
                "doc_id": doc_id,
                "normalization_id": normalization_id,
                "normalized_at": datetime.utcnow().isoformat(),
                "normalized_data": normalized_data,
                "entities": stored_entities,
                "entity_count": len(stored_entities),
            }

            logger.info("Normalization completed", results=normalization_results)

            # Update metrics
            metrics.counter("documents_normalized_total").labels(
            metrics.counter("normalized_documents_total").labels(
                tenant_id=tenant_id
            ).inc()

            metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
                len(stored_entities)
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "normalization_id": normalization_id,
                    "entity_count": len(stored_entities),
                    "entities": stored_entities,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)

            logger.info(
                "Normalization completed", doc_id=doc_id, entities=len(stored_entities)
                "Document normalized and mapped", doc_id=doc_id, tenant_id=tenant_id
            )

        except Exception as e:
            logger.error("Normalization failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            logger.error(
                "Failed to normalize and map document", doc_id=doc_id, error=str(e)
            )
            metrics.counter("normalization_errors_total").labels(
                tenant_id=tenant_id, error_type=type(e).__name__
            ).inc()


async def _normalize_data(
    extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
    """Normalize extracted data"""

    normalized = {}

    for field_name, raw_value in extracted_fields.items():
        try:
            if "amount" in field_name.lower() or "total" in field_name.lower():
                normalized[field_name] = _normalize_amount(raw_value)
            elif "date" in field_name.lower():
                normalized[field_name] = _normalize_date(raw_value)
            elif "name" in field_name.lower():
                normalized[field_name] = _normalize_name(raw_value)
            elif "address" in field_name.lower():
                normalized[field_name] = _normalize_address(raw_value)
            elif "number" in field_name.lower():
                normalized[field_name] = _normalize_number(raw_value)
            else:
                normalized[field_name] = _normalize_text(raw_value)

        except Exception as e:
            logger.warning(
                "Failed to normalize field",
                field=field_name,
                value=raw_value,
                error=str(e),
            )
            normalized[field_name] = raw_value  # Keep original value

    return normalized


def _normalize_amount(value: str) -> dict[str, Any]:
    """Normalize monetary amount"""
    import re

    if not value:
        return {"amount": None, "currency": settings.currency_default}

    # Remove currency symbols and formatting
    clean_value = re.sub(r"[£$€,\s]", "", str(value))

    try:
        amount = Decimal(clean_value)

        # Validate amount
        if amount > settings.max_amount:
            logger.warning("Amount exceeds maximum", amount=amount)

        return {
            "amount": float(amount),
            "currency": settings.currency_default,
            "original": value,
        }
    except Exception:
        return {
            "amount": None,
            "currency": settings.currency_default,
            "original": value,
        }
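# Illustrative behaviour of _normalize_amount (values invented):
#   _normalize_amount("£1,250.00") -> {"amount": 1250.0, "currency": "GBP", "original": "£1,250.00"}
#   _normalize_amount("")          -> {"amount": None, "currency": "GBP"}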
def _normalize_date(value: str) -> dict[str, Any]:
    """Normalize date"""
    from dateutil import parser

    if not value:
        return {"date": None, "original": value}

    try:
        # Try parsing with dateutil first
        parsed_date = parser.parse(str(value), dayfirst=True)
        return {"date": parsed_date.date().isoformat(), "original": value}
    except Exception:
        # Try manual formats
        for fmt in settings.date_formats:
async def _normalize_data(extracted_fields: dict[str, Any]) -> dict[str, Any]:
    """Normalize extracted data into a consistent format"""
    normalized_data = {}
    for key, value in extracted_fields.items():
        # Example: Simple date normalization (can be expanded)
        if "date" in key.lower() and isinstance(value, str):
            try:
                parsed_date = datetime.strptime(str(value), fmt)
                return {"date": parsed_date.date().isoformat(), "original": value}
            except Exception:
                continue

        return {"date": None, "original": value}
                # Attempt to parse various date formats
                # Add more robust date parsing logic here as needed
                normalized_data[key] = datetime.fromisoformat(value).date().isoformat()
            except ValueError:
                normalized_data[key] = value  # Keep original if parsing fails
        elif "amount" in key.lower() and isinstance(value, str):
            # Example: Normalize currency to a Decimal
            try:
                normalized_data[key] = float(value.replace("£", "").replace(",", ""))
            except ValueError:
                normalized_data[key] = value
        else:
            normalized_data[key] = value
    return normalized_data


def _normalize_name(value: str) -> dict[str, Any]:
    """Normalize person/company name"""
    if not value:
        return {"name": None, "original": value}
async def _map_to_kg_ontology(
    doc_id: str,
    tenant_id: str,
    normalized_data: dict[str, Any],
    provenance: list[dict[str, Any]],
) -> dict[str, Any]:
    """Map normalized data to Knowledge Graph ontology nodes and relationships based on kg_schema.json"""
    nodes = []
    relationships = []
    now = datetime.now(UTC).isoformat()

    # Clean and title case
    clean_name = str(value).strip().title()
    # Create a Document node
    doc_node_id = f"document_{doc_id}"
    nodes.append(
        {
            "id": doc_node_id,
            "type": "Document",
            "properties": {
                "node_type": "Document",
                "doc_id": doc_id,
                "kind": normalized_data.get("kind", "OtherSupportingDoc"),
                "source": normalized_data.get("source", "manual_upload"),
                "checksum": normalized_data.get("checksum", ""),
                "valid_from": now,
                "asserted_at": now,
                # "source": "svc-normalize-map",
                "extractor_version": "1.0.0",
            },
        }
    )

    # Detect if it's a company (contains Ltd, Limited, etc.)
    company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
    is_company = any(indicator in clean_name for indicator in company_indicators)
    # Create a TaxpayerProfile node
    taxpayer_id = normalized_data.get("taxpayer_id", "unknown_taxpayer")
    taxpayer_node_id = f"taxpayer_{taxpayer_id}"
    nodes.append(
        {
            "id": taxpayer_node_id,
            "type": "TaxpayerProfile",
            "properties": {
                "node_type": "TaxpayerProfile",
                "taxpayer_id": taxpayer_id,
                "type": "Individual",
                "valid_from": now,
                "asserted_at": now,
                "source": "svc-normalize-map",
                "extractor_version": "1.0.0",
            },
        }
    )

    relationships.append(
        {
            "id": f"rel_document_to_taxpayer_{doc_id}",
            "type": "BELONGS_TO",
            "sourceId": doc_node_id,
            "targetId": taxpayer_node_id,
            "properties": {},
        }
    )

    # Create IncomeItem/ExpenseItem nodes and Evidence nodes
    item_type = (
        "IncomeItem" if normalized_data.get("kind") == "invoice" else "ExpenseItem"
    )

    for field, value in normalized_data.items():
        if field in ["total_amount", "net_amount", "vat_amount", "amount"]:
            item_id = f"item_{ulid.new()}"
            item_node_id = f"{item_type.lower()}_{item_id}"

            # Create the financial item node (IncomeItem or ExpenseItem)
            nodes.append(
                {
                    "id": item_node_id,
                    "type": item_type,
                    "properties": {
                        "node_type": item_type,
                        "type": (
                            "self_employment"
                            if "invoice" in normalized_data.get("kind", "")
                            else "other"
                        ),
                        "gross": value,
                        "currency": "GBP",
                        "description": normalized_data.get("description", field),
                        "valid_from": now,
                        "asserted_at": now,
                        "source": "svc-normalize-map",
                        "extractor_version": "1.0.0",
                    },
                }
            )

            relationships.append(
                {
                    "id": f"rel_taxpayer_has_{item_type.lower()}_{item_id}",
                    "type": (
                        "HAS_INCOME" if item_type == "IncomeItem" else "HAS_EXPENSE"
                    ),
                    "sourceId": taxpayer_node_id,
                    "targetId": item_node_id,
                    "properties": {},
                }
            )

            # Create an Evidence node linking the item to the document
            prov = next((p for p in provenance if p["field"] == field), None)
            if prov:
                evidence_id = f"evidence_{item_id}"
                nodes.append(
                    {
                        "id": evidence_id,
                        "type": "Evidence",
                        "properties": {
                            "node_type": "Evidence",
                            "snippet_id": evidence_id,
                            "doc_ref": doc_id,
                            "page": prov.get("page"),
                            "bbox": prov.get("bbox"),
                            "text_hash": "dummy_hash",  # Placeholder
                            "ocr_confidence": prov.get("confidence"),
                            "extracted_text": str(value),
                            "valid_from": now,
                            "asserted_at": now,
                            "source": "svc-normalize-map",
                            "extractor_version": "1.0.0",
                        },
                    }
                )

                relationships.append(
                    {
                        "id": f"rel_item_supported_by_evidence_{item_id}",
                        "type": "SUPPORTED_BY",
                        "sourceId": item_node_id,
                        "targetId": evidence_id,
                        "properties": {},
                    }
                )

    return {
        "name": clean_name,
        "type": "company" if is_company else "person",
        "original": value,
        "nodes": nodes,
        "relationships": relationships,
        "document_id": doc_id,
        "tenant_id": tenant_id,
    }


def _normalize_address(value: str) -> dict[str, Any]:
    """Normalize address"""
    import re

    if not value:
        return {"address": None, "original": value}

    clean_address = str(value).strip()

    # Extract UK postcode
    postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
    postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
    postcode = postcode_match.group().upper() if postcode_match else None

    return {"address": clean_address, "postcode": postcode, "original": value}
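# Illustrative behaviour (input invented):
#   _normalize_address("1 High St, London sw1a 1aa")
#   -> {"address": "1 High St, London sw1a 1aa", "postcode": "SW1A 1AA", "original": "1 High St, London sw1a 1aa"}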
def _normalize_number(value: str) -> dict[str, Any]:
    """Normalize reference numbers"""
    import re

    if not value:
        return {"number": None, "original": value}

    # Remove spaces and special characters
    clean_number = re.sub(r"[^\w]", "", str(value))

    # Detect number type
    number_type = "unknown"
    if len(clean_number) == 10 and clean_number.isdigit():
        number_type = "utr"  # UTR is 10 digits
    elif len(clean_number) == 8 and clean_number.isdigit():
        number_type = "account_number"
    elif re.match(r"^\d{6}$", clean_number):
        number_type = "sort_code"

    return {"number": clean_number, "type": number_type, "original": value}
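# Illustrative behaviour (a UTR is HMRC's 10-digit Unique Taxpayer Reference):
#   _normalize_number("12345 67890") -> {"number": "1234567890", "type": "utr", "original": "12345 67890"}
#   _normalize_number("12-34-56")    -> {"number": "123456", "type": "sort_code", "original": "12-34-56"}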
def _normalize_text(value: str) -> dict[str, Any]:
    """Normalize general text"""
    if not value:
        return {"text": None, "original": value}

    clean_text = str(value).strip()

    return {"text": clean_text, "original": value}


async def _map_to_entities(
    normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
    """Map normalized data to knowledge graph entities"""

    entities = []

    # Create document entity
    doc_entity = {
        "type": "Document",
        "id": doc_id,
        "properties": {
            "doc_id": doc_id,
            "tenant_id": tenant_id,
            "processed_at": datetime.utcnow().isoformat(),
            "source": "extraction",
            "extractor_version": "1.0.0",
            "valid_from": datetime.utcnow(),
            "asserted_at": datetime.utcnow(),
        },
    }
    entities.append(doc_entity)

    # Map specific field types to entities
    for field_name, normalized_value in normalized_data.items():
        if isinstance(normalized_value, dict):
            if "amount" in normalized_value and normalized_value["amount"] is not None:
                # Create expense or income item
                entity_type = (
                    "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
                )
                entity = {
                    "type": entity_type,
                    "id": f"{entity_type.lower()}_{ulid.new()}",
                    "properties": {
                        "amount": normalized_value["amount"],
                        "currency": normalized_value["currency"],
                        "description": field_name,
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

            elif "name" in normalized_value and normalized_value["name"] is not None:
                # Create party entity
                entity = {
                    "type": "Party",
                    "id": f"party_{ulid.new()}",
                    "properties": {
                        "name": normalized_value["name"],
                        "party_type": normalized_value.get("type", "unknown"),
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

    return entities


async def _store_entities(
    entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
    """Store entities in knowledge graph"""

    stored_entities = []

    for entity in entities:
        try:
            # Create node in Neo4j
            result = await neo4j_client.create_node(
                label=entity["type"], properties=entity["properties"]
            )

            stored_entities.append(
                {
                    "type": entity["type"],
                    "id": entity["id"],
                    "neo4j_id": result.get("id"),
                    "properties": entity["properties"],
                }
            )

            logger.debug("Entity stored", type=entity["type"], id=entity["id"])

        except Exception as e:
            logger.error("Failed to store entity", entity=entity, error=str(e))

    return stored_entities


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
@@ -579,8 +330,8 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONRe
        status=exc.status_code,
        detail=exc.detail,
        instance=str(request.url),
        trace_id="",
    ).dict(),
        trace_id=getattr(request.state, "trace_id", None),
    ).model_dump(),
    )
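As a quick illustration of the reworked _normalize_data contract, a standalone sketch (assumes the service module imports cleanly from the repo root; field names and values are invented):

import asyncio
from apps.svc_normalize_map.main import _normalize_data

fields = {"invoice_date": "2024-05-01", "total_amount": "£1,250.00", "supplier_name": "Acme Ltd"}
print(asyncio.run(_normalize_data(fields)))
# -> {'invoice_date': '2024-05-01', 'total_amount': 1250.0, 'supplier_name': 'Acme Ltd'}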
@@ -1,37 +1 @@
# FastAPI and server
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0

# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.3.3
numpy>=2.3.3

# Currency and exchange rates
forex-python>=1.9.2
babel>=2.17.0

# Date and time processing
python-dateutil>=2.9.0
pytz>=2025.2

# Text normalization
unidecode>=1.4.0
phonenumbers>=9.0.16

# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.27.1

# Geographic data
geopy>=2.4.1
pycountry>=24.6.1

# Data validation
cerberus>=1.3.7
marshmallow>=4.0.1

# UK-specific utilities
uk-postcode-utils>=1.1
python-ulid