completed local setup with compose
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
@@ -13,9 +13,10 @@ ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY libs/requirements-ml.txt /tmp/libs-ml-requirements.txt
|
||||
COPY apps/svc_extract/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-ml-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
@@ -43,7 +43,7 @@ RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
|
||||
@@ -44,7 +44,7 @@ RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
|
||||
@@ -158,13 +158,13 @@ async def upload_document(
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"doc_id": doc_id,
|
||||
"tenant_id": tenant_id,
|
||||
"filename": file.filename or "unknown",
|
||||
"kind": kind.value,
|
||||
"source": source,
|
||||
"checksum": checksum,
|
||||
"file_size": len(content),
|
||||
"content_type": content_type,
|
||||
"s3_url": storage_result["s3_url"],
|
||||
"checksum_sha256": checksum,
|
||||
"size_bytes": len(content),
|
||||
"mime_type": content_type,
|
||||
"storage_path": storage_result["s3_url"],
|
||||
},
|
||||
actor=current_user.get("sub", "system"),
|
||||
tenant_id=tenant_id,
|
||||
|
||||
@@ -1,54 +1,27 @@
|
||||
# Multi-stage build for svc_kg
|
||||
FROM python:3.12-slim AS builder
|
||||
FROM python:3.12-slim-bookworm
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
# Set environment variables
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
ENV APP_HOME /app
|
||||
|
||||
# Create virtual environment
|
||||
RUN python -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
# Create and set working directory
|
||||
WORKDIR $APP_HOME
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
# Install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY libs/requirements-rdf.txt /tmp/libs-rdf.txt
|
||||
COPY apps/svc_kg/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-rdf.txt -r /tmp/requirements.txt
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& groupadd -r appuser \
|
||||
&& useradd -r -g appuser appuser
|
||||
|
||||
# Copy virtual environment from builder
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_kg/ ./apps/svc_kg/
|
||||
|
||||
# Create non-root user and set permissions
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
|
||||
|
||||
# Run the application
|
||||
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_kg.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
||||
@@ -1,28 +1,22 @@
|
||||
# FILE: apps/svc-kg/main.py
|
||||
|
||||
# Knowledge graph facade with CRUD, queries, lineage, and SHACL validation
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from typing import Any, cast
|
||||
|
||||
import structlog
|
||||
from fastapi import Depends, HTTPException, Query, Request
|
||||
from fastapi import HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from pyshacl import validate
|
||||
from rdflib import Graph, Literal, URIRef
|
||||
from rdflib.namespace import RDF
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
|
||||
from libs.events import EventBus
|
||||
from libs.neo import Neo4jClient, SHACLValidator, TemporalQueries
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.neo import Neo4jClient
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
@@ -31,523 +25,193 @@ class KGSettings(BaseAppSettings):
|
||||
"""Settings for KG service"""
|
||||
|
||||
service_name: str = "svc-kg"
|
||||
shacl_shapes_path: str = "schemas/shapes.ttl"
|
||||
|
||||
# SHACL validation
|
||||
shapes_file: str = "schemas/shapes.ttl"
|
||||
validate_on_write: bool = True
|
||||
|
||||
# Query limits
|
||||
max_results: int = 1000
|
||||
max_depth: int = 10
|
||||
query_timeout: int = 30
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-kg",
|
||||
title="Tax Agent Knowledge Graph Service",
|
||||
description="Knowledge graph facade with CRUD and queries",
|
||||
settings_class=KGSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
shacl_validator: SHACLValidator | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-kg")
|
||||
metrics = get_metrics()
|
||||
shapes_graph: Graph | None = None
|
||||
|
||||
settings: KGSettings
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
async def init_dependencies(app_settings: KGSettings) -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global neo4j_client, shacl_validator, event_bus
|
||||
global neo4j_client, event_bus, settings, shapes_graph
|
||||
|
||||
settings = app_settings
|
||||
logger.info("Starting KG service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize SHACL validator
|
||||
if os.path.exists(settings.shapes_file):
|
||||
shacl_validator = SHACLValidator(settings.shapes_file)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
if not event_bus:
|
||||
raise HTTPException(status_code=500, detail="Event bus not initialized")
|
||||
await event_bus.start()
|
||||
|
||||
logger.info("KG service started successfully")
|
||||
await event_bus.subscribe(EventTopics.KG_UPSERT_READY, _handle_kg_upsert_ready)
|
||||
|
||||
# Load SHACL shapes
|
||||
try:
|
||||
shapes_graph = Graph().parse(settings.shacl_shapes_path, format="turtle")
|
||||
logger.info("SHACL shapes loaded successfully")
|
||||
except Exception as e:
|
||||
logger.error("Failed to load SHACL shapes", error=str(e))
|
||||
shapes_graph = None
|
||||
|
||||
|
||||
app, _settings = create_app(
|
||||
service_name="svc-kg",
|
||||
title="Tax Agent Knowledge Graph Service",
|
||||
description="Service for managing and validating the Knowledge Graph",
|
||||
settings_class=KGSettings,
|
||||
)
|
||||
|
||||
|
||||
# Initialize dependencies immediately
|
||||
@app.on_event("startup")
|
||||
async def startup_event():
|
||||
await init_dependencies(cast(KGSettings, _settings))
|
||||
|
||||
|
||||
tracer = get_tracer("svc-kg")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
global event_bus, neo4j_client
|
||||
|
||||
logger.info("Shutting down KG service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
logger.info("KG service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
async def _handle_kg_upsert_ready(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle KG upsert ready events"""
|
||||
data = payload.data
|
||||
nodes = data.get("nodes", [])
|
||||
relationships = data.get("relationships", [])
|
||||
document_id = data.get("document_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
|
||||
if not nodes and not relationships:
|
||||
logger.warning("No nodes or relationships to upsert", data=data)
|
||||
return
|
||||
|
||||
@app.post("/nodes/{label}")
|
||||
async def create_node(
|
||||
label: str,
|
||||
properties: dict[str, Any],
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Create a new node"""
|
||||
|
||||
with tracer.start_as_current_span("create_node") as span:
|
||||
span.set_attribute("label", label)
|
||||
with tracer.start_as_current_span("upsert_kg_data") as span:
|
||||
span.set_attribute("document_id", document_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("node_count", len(nodes))
|
||||
span.set_attribute("relationship_count", len(relationships))
|
||||
|
||||
try:
|
||||
# Add tenant isolation
|
||||
properties["tenant_id"] = tenant_id
|
||||
properties["created_by"] = current_user.get("sub", "system")
|
||||
|
||||
# Validate with SHACL if enabled
|
||||
if settings.validate_on_write and shacl_validator:
|
||||
await _validate_node(label, properties)
|
||||
|
||||
# Create node
|
||||
result = await neo4j_client.create_node(label, properties)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("nodes_created_total").labels(
|
||||
tenant_id=tenant_id, label=label
|
||||
).inc()
|
||||
|
||||
logger.info("Node created", label=label, node_id=result.get("id"))
|
||||
|
||||
return {
|
||||
"status": "created",
|
||||
"label": label,
|
||||
"properties": properties,
|
||||
"neo4j_result": result,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to create node", label=label, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to create node: {str(e)}"
|
||||
# 1. Validate data against SHACL schema
|
||||
conforms, validation_report = await _validate_with_shacl(
|
||||
nodes, relationships
|
||||
)
|
||||
if not conforms:
|
||||
logger.error(
|
||||
"SHACL validation failed",
|
||||
document_id=document_id,
|
||||
validation_report=validation_report,
|
||||
)
|
||||
metrics.counter("kg_validation_errors_total").labels(
|
||||
tenant_id=tenant_id
|
||||
).inc()
|
||||
return
|
||||
|
||||
# 2. Write data to Neo4j
|
||||
for node in nodes:
|
||||
await neo4j_client.create_node(node["type"], node["properties"]) # type: ignore
|
||||
|
||||
@app.get("/nodes/{label}")
|
||||
async def get_nodes(
|
||||
label: str,
|
||||
limit: int = Query(default=100, le=settings.max_results),
|
||||
filters: str | None = Query(default=None),
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get nodes by label with optional filters"""
|
||||
for rel in relationships:
|
||||
await neo4j_client.create_relationship( # type: ignore
|
||||
rel["sourceId"],
|
||||
rel["targetId"],
|
||||
rel["type"],
|
||||
rel["properties"],
|
||||
)
|
||||
|
||||
with tracer.start_as_current_span("get_nodes") as span:
|
||||
span.set_attribute("label", label)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("limit", limit)
|
||||
|
||||
try:
|
||||
# Parse filters
|
||||
filter_dict: dict[str, Any] = {}
|
||||
if filters:
|
||||
try:
|
||||
filter_dict = json.loads(filters)
|
||||
except json.JSONDecodeError:
|
||||
raise HTTPException(status_code=400, detail="Invalid filters JSON")
|
||||
|
||||
# Add tenant isolation
|
||||
filter_dict["tenant_id"] = tenant_id
|
||||
|
||||
# Build query
|
||||
query = TemporalQueries.get_current_state_query(label, filter_dict)
|
||||
query += f" LIMIT {limit}"
|
||||
|
||||
# Execute query
|
||||
results = await neo4j_client.run_query(query)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("nodes_queried_total").labels(
|
||||
tenant_id=tenant_id, label=label
|
||||
).inc()
|
||||
|
||||
return {
|
||||
"label": label,
|
||||
"count": len(results),
|
||||
"nodes": [result["n"] for result in results],
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to get nodes", label=label, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to get nodes: {str(e)}"
|
||||
# 3. Publish kg.upserted event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"document_id": document_id,
|
||||
"tenant_id": tenant_id,
|
||||
"taxpayer_id": data.get("taxpayer_id"),
|
||||
"tax_year": data.get("tax_year"),
|
||||
"node_count": len(nodes),
|
||||
"relationship_count": len(relationships),
|
||||
},
|
||||
actor=payload.actor,
|
||||
tenant_id=tenant_id,
|
||||
trace_id=str(span.get_span_context().trace_id),
|
||||
)
|
||||
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload) # type: ignore
|
||||
|
||||
|
||||
@app.get("/nodes/{label}/{node_id}")
|
||||
async def get_node(
|
||||
label: str,
|
||||
node_id: str,
|
||||
include_lineage: bool = Query(default=False),
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get specific node with optional lineage"""
|
||||
|
||||
with tracer.start_as_current_span("get_node") as span:
|
||||
span.set_attribute("label", label)
|
||||
span.set_attribute("node_id", node_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get node
|
||||
query = f"""
|
||||
MATCH (n:{label} {{id: $node_id, tenant_id: $tenant_id}})
|
||||
WHERE n.retracted_at IS NULL
|
||||
RETURN n
|
||||
"""
|
||||
|
||||
results = await neo4j_client.run_query(
|
||||
query, {"node_id": node_id, "tenant_id": tenant_id}
|
||||
)
|
||||
|
||||
if not results:
|
||||
raise HTTPException(status_code=404, detail="Node not found")
|
||||
|
||||
node_data = results[0]["n"]
|
||||
|
||||
# Get lineage if requested
|
||||
lineage: list[dict[str, Any]] = []
|
||||
if include_lineage:
|
||||
lineage = await neo4j_client.get_node_lineage(node_id)
|
||||
|
||||
return {"node": node_data, "lineage": lineage if include_lineage else None}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to get node", label=label, node_id=node_id, error=str(e)
|
||||
)
|
||||
raise HTTPException(status_code=500, detail=f"Failed to get node: {str(e)}")
|
||||
|
||||
|
||||
@app.put("/nodes/{label}/{node_id}")
|
||||
async def update_node(
|
||||
label: str,
|
||||
node_id: str,
|
||||
properties: dict[str, Any],
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Update node with bitemporal versioning"""
|
||||
|
||||
with tracer.start_as_current_span("update_node") as span:
|
||||
span.set_attribute("label", label)
|
||||
span.set_attribute("node_id", node_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Add metadata
|
||||
properties["tenant_id"] = tenant_id
|
||||
properties["updated_by"] = current_user.get("sub", "system")
|
||||
|
||||
# Validate with SHACL if enabled
|
||||
if settings.validate_on_write and shacl_validator:
|
||||
await _validate_node(label, properties)
|
||||
|
||||
# Update node (creates new version)
|
||||
await neo4j_client.update_node(label, node_id, properties)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("nodes_updated_total").labels(
|
||||
tenant_id=tenant_id, label=label
|
||||
).inc()
|
||||
|
||||
logger.info("Node updated", label=label, node_id=node_id)
|
||||
|
||||
return {
|
||||
"status": "updated",
|
||||
"label": label,
|
||||
"node_id": node_id,
|
||||
"properties": properties,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to update node", label=label, node_id=node_id, error=str(e)
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to update node: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.post("/relationships")
|
||||
async def create_relationship(
|
||||
from_label: str,
|
||||
from_id: str,
|
||||
to_label: str,
|
||||
to_id: str,
|
||||
relationship_type: str,
|
||||
properties: dict[str, Any] | None = None,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Create relationship between nodes"""
|
||||
|
||||
with tracer.start_as_current_span("create_relationship") as span:
|
||||
span.set_attribute("from_label", from_label)
|
||||
span.set_attribute("to_label", to_label)
|
||||
span.set_attribute("relationship_type", relationship_type)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Add metadata
|
||||
rel_properties = properties or {}
|
||||
rel_properties["tenant_id"] = tenant_id
|
||||
rel_properties["created_by"] = current_user.get("sub", "system")
|
||||
|
||||
# Create relationship
|
||||
await neo4j_client.create_relationship(
|
||||
from_label, from_id, to_label, to_id, relationship_type, rel_properties
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("relationships_created_total").labels(
|
||||
tenant_id=tenant_id, relationship_type=relationship_type
|
||||
).inc()
|
||||
|
||||
metrics.counter("kg_upserts_total").labels(tenant_id=tenant_id).inc()
|
||||
logger.info(
|
||||
"Relationship created",
|
||||
from_id=from_id,
|
||||
to_id=to_id,
|
||||
type=relationship_type,
|
||||
"KG upsert completed", document_id=document_id, tenant_id=tenant_id
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "created",
|
||||
"from_id": from_id,
|
||||
"to_id": to_id,
|
||||
"relationship_type": relationship_type,
|
||||
"properties": rel_properties,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to create relationship", error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to create relationship: {str(e)}"
|
||||
logger.error(
|
||||
"Failed to upsert KG data", document_id=document_id, error=str(e)
|
||||
)
|
||||
|
||||
|
||||
@app.post("/query")
|
||||
async def execute_query(
|
||||
query: str,
|
||||
parameters: dict[str, Any] | None = None,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Execute custom Cypher query with tenant isolation"""
|
||||
|
||||
with tracer.start_as_current_span("execute_query") as span:
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Add tenant isolation to parameters
|
||||
query_params = parameters or {}
|
||||
query_params["tenant_id"] = tenant_id
|
||||
|
||||
# Validate query (basic security check)
|
||||
if not _is_safe_query(query):
|
||||
raise HTTPException(status_code=400, detail="Unsafe query detected")
|
||||
|
||||
# Execute query with timeout
|
||||
results = await neo4j_client.run_query(query, query_params, max_retries=1)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("custom_queries_total").labels(tenant_id=tenant_id).inc()
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
"parameters": query_params,
|
||||
"results": results,
|
||||
"count": len(results),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Query execution failed", query=query[:100], error=str(e))
|
||||
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/export/rdf")
|
||||
async def export_rdf(
|
||||
format: str = Query(default="turtle"),
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Export knowledge graph as RDF"""
|
||||
|
||||
with tracer.start_as_current_span("export_rdf") as span:
|
||||
span.set_attribute("format", format)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Export tenant-specific data
|
||||
rdf_data = await neo4j_client.export_to_rdf(format)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("rdf_exports_total").labels(
|
||||
tenant_id=tenant_id, format=format
|
||||
metrics.counter("kg_upsert_errors_total").labels(
|
||||
tenant_id=tenant_id, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
return {
|
||||
"format": format,
|
||||
"rdf_data": rdf_data,
|
||||
"exported_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("RDF export failed", format=format, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"RDF export failed: {str(e)}"
|
||||
) from e
|
||||
async def _validate_with_shacl(
|
||||
nodes: list[dict[str, Any]], relationships: list[dict[str, Any]]
|
||||
) -> tuple[bool, str]:
|
||||
"""Validate data against SHACL shapes."""
|
||||
if not shapes_graph:
|
||||
logger.warning("SHACL shapes not loaded, skipping validation.")
|
||||
return True, "SHACL shapes not loaded"
|
||||
|
||||
data_graph = Graph()
|
||||
namespace = "http://ai-tax-agent.com/ontology/"
|
||||
|
||||
@app.post("/validate")
|
||||
async def validate_graph(
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Validate knowledge graph with SHACL"""
|
||||
for node in nodes:
|
||||
node_uri = URIRef(f"{namespace}{node['id']}")
|
||||
data_graph.add((node_uri, RDF.type, URIRef(f"{namespace}{node['type']}")))
|
||||
for key, value in node["properties"].items():
|
||||
if value is not None:
|
||||
data_graph.add((node_uri, URIRef(f"{namespace}{key}"), Literal(value)))
|
||||
|
||||
with tracer.start_as_current_span("validate_graph") as span:
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
if not shacl_validator:
|
||||
raise HTTPException(
|
||||
status_code=501, detail="SHACL validation not configured"
|
||||
)
|
||||
|
||||
# Export current graph state
|
||||
rdf_export = await neo4j_client.export_to_rdf("turtle")
|
||||
|
||||
# Extract RDF data from export result
|
||||
rdf_data = rdf_export.get("rdf_data", "")
|
||||
if not rdf_data:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to export RDF data for validation"
|
||||
)
|
||||
|
||||
# Run SHACL validation
|
||||
validation_result = await shacl_validator.validate_graph(rdf_data)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("validations_total").labels(
|
||||
tenant_id=tenant_id, conforms=validation_result["conforms"]
|
||||
).inc()
|
||||
|
||||
return {
|
||||
"conforms": validation_result["conforms"],
|
||||
"violations_count": validation_result["violations_count"],
|
||||
"results_text": validation_result["results_text"],
|
||||
"validated_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Graph validation failed", error=str(e))
|
||||
raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
|
||||
|
||||
|
||||
async def _validate_node(label: str, properties: dict[str, Any]) -> bool:
|
||||
"""Validate node with SHACL"""
|
||||
if not shacl_validator:
|
||||
return True
|
||||
for rel in relationships:
|
||||
source_uri = URIRef(f"{namespace}{rel['sourceId']}")
|
||||
target_uri = URIRef(f"{namespace}{rel['targetId']}")
|
||||
rel_uri = URIRef(f"{namespace}{rel['type']}")
|
||||
data_graph.add((source_uri, rel_uri, target_uri))
|
||||
|
||||
try:
|
||||
# Create a minimal RDF representation of the node for validation
|
||||
rdf_lines = ["@prefix tax: <https://tax-kg.example.com/> ."]
|
||||
node_uri = "tax:temp_node"
|
||||
|
||||
# Add type declaration
|
||||
rdf_lines.append(f"{node_uri} a tax:{label} .")
|
||||
|
||||
# Add properties
|
||||
for prop, value in properties.items():
|
||||
if isinstance(value, str):
|
||||
rdf_lines.append(f'{node_uri} tax:{prop} "{value}" .')
|
||||
else:
|
||||
rdf_lines.append(f"{node_uri} tax:{prop} {value} .")
|
||||
|
||||
rdf_data = "\n".join(rdf_lines)
|
||||
|
||||
# Validate the node RDF data
|
||||
validation_result = await shacl_validator.validate_graph(rdf_data)
|
||||
|
||||
if not validation_result["conforms"]:
|
||||
logger.warning(
|
||||
"Node SHACL validation failed",
|
||||
label=label,
|
||||
violations=validation_result["violations_count"],
|
||||
details=validation_result["results_text"],
|
||||
)
|
||||
return False
|
||||
|
||||
logger.debug("Node SHACL validation passed", label=label)
|
||||
return True
|
||||
|
||||
conforms, results_graph, results_text = validate(
|
||||
data_graph,
|
||||
shacl_graph=shapes_graph,
|
||||
ont_graph=None, # No ontology graph
|
||||
inference="rdfs",
|
||||
abort_on_first=False,
|
||||
allow_infos=False,
|
||||
meta_shacl=False,
|
||||
advanced=False,
|
||||
js=False,
|
||||
debug=False,
|
||||
)
|
||||
return conforms, results_text
|
||||
except Exception as e:
|
||||
logger.error("Node SHACL validation error", label=label, error=str(e))
|
||||
# Return True to not block operations on validation errors
|
||||
return True
|
||||
|
||||
|
||||
def _is_safe_query(query: str) -> bool:
|
||||
"""Basic query safety check"""
|
||||
query_lower = query.lower()
|
||||
|
||||
# Block dangerous operations
|
||||
dangerous_keywords = [
|
||||
"delete",
|
||||
"remove",
|
||||
"drop",
|
||||
"create index",
|
||||
"create constraint",
|
||||
"load csv",
|
||||
"call",
|
||||
"foreach",
|
||||
]
|
||||
|
||||
for keyword in dangerous_keywords:
|
||||
if keyword in query_lower:
|
||||
return False
|
||||
|
||||
return True
|
||||
logger.error("Error during SHACL validation", error=str(e))
|
||||
return False, str(e)
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
@@ -561,7 +225,7 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONRe
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id="",
|
||||
trace_id=getattr(request.state, "trace_id", None),
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
@@ -1,22 +1,2 @@
|
||||
# Service-specific dependencies
|
||||
# RDF and semantic web
|
||||
rdflib>=7.2.1
|
||||
pyshacl>=0.30.1
|
||||
|
||||
# Graph algorithms
|
||||
networkx>=3.5
|
||||
|
||||
# Data export formats
|
||||
xmltodict>=1.0.2
|
||||
|
||||
# Query optimization
|
||||
pyparsing>=3.2.5
|
||||
|
||||
# Graph visualization (optional)
|
||||
graphviz>=0.21
|
||||
|
||||
# Additional Neo4j utilities
|
||||
neomodel>=5.5.3
|
||||
|
||||
# Cypher query building
|
||||
py2neo>=2021.2.4
|
||||
setuptools
|
||||
pyshacl==0.23.0
|
||||
|
||||
@@ -1,53 +1,27 @@
|
||||
# Multi-stage build for svc_normalize_map
|
||||
FROM python:3.12-slim AS builder
|
||||
FROM python:3.12-slim-bookworm
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
# Set environment variables
|
||||
ENV PYTHONUNBUFFERED 1
|
||||
ENV APP_HOME /app
|
||||
|
||||
# Create virtual environment
|
||||
RUN python -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
# Create and set working directory
|
||||
WORKDIR $APP_HOME
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
# Install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& groupadd -r appuser \
|
||||
&& useradd -r -g appuser appuser
|
||||
|
||||
# Copy virtual environment from builder
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/
|
||||
|
||||
# Create non-root user and set permissions
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
|
||||
|
||||
# Run the application
|
||||
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
||||
@@ -1,24 +1,11 @@
|
||||
"""Data normalization and knowledge graph mapping."""
|
||||
|
||||
# FILE: apps/svc-normalize-map/main.py
|
||||
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
|
||||
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
|
||||
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
|
||||
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
|
||||
# mypy: disable-error-code=union-attr
|
||||
|
||||
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any, cast
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi import HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
@@ -34,83 +21,68 @@ from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.neo import Neo4jClient
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
from libs.storage import DocumentStorage, StorageClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class NormalizeMapSettings(BaseAppSettings):
|
||||
"""Settings for normalize-map service"""
|
||||
"""Settings for NormalizeMap service"""
|
||||
|
||||
service_name: str = "svc-normalize-map"
|
||||
|
||||
# Normalization configuration
|
||||
currency_default: str = "GBP"
|
||||
date_formats: list[str] = [
|
||||
"%Y-%m-%d",
|
||||
"%d/%m/%Y",
|
||||
"%d-%m-%Y",
|
||||
"%d %B %Y",
|
||||
"%d %b %Y",
|
||||
"%B %d, %Y",
|
||||
]
|
||||
|
||||
# Mapping configuration
|
||||
confidence_threshold: float = 0.7
|
||||
auto_create_entities: bool = True
|
||||
|
||||
# Validation rules
|
||||
max_amount: float = 1000000.0 # £1M
|
||||
min_confidence: float = 0.5
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-normalize-map",
|
||||
title="Tax Agent Normalize-Map Service",
|
||||
description="Data normalization and knowledge graph mapping service",
|
||||
settings_class=NormalizeMapSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
storage_client: StorageClient | None = None
|
||||
document_storage: DocumentStorage | None = None
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-normalize-map")
|
||||
metrics = get_metrics()
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
|
||||
settings: NormalizeMapSettings
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
async def init_dependencies(app_settings: NormalizeMapSettings) -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global storage_client, document_storage, neo4j_client, event_bus
|
||||
global storage_client, document_storage, event_bus, neo4j_client, settings
|
||||
|
||||
logger.info("Starting normalize-map service")
|
||||
settings = app_settings
|
||||
logger.info("Starting NormalizeMap service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize MinIO client
|
||||
minio_client = create_minio_client(settings)
|
||||
storage_client = StorageClient(minio_client)
|
||||
document_storage = DocumentStorage(storage_client)
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
if not event_bus:
|
||||
raise HTTPException(status_code=500, detail="Event bus not initialized")
|
||||
await event_bus.start()
|
||||
|
||||
# Subscribe to extraction completion events
|
||||
await event_bus.subscribe( # type: ignore
|
||||
EventTopics.DOC_EXTRACTED, _handle_extraction_completed
|
||||
)
|
||||
await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted)
|
||||
|
||||
logger.info("Normalize-map service started successfully")
|
||||
logger.info("NormalizeMap service started successfully")
|
||||
|
||||
|
||||
app, _settings = create_app(
|
||||
service_name="svc-normalize-map",
|
||||
title="Tax Agent Normalize and Map Service",
|
||||
description="Normalize extracted data and map to Knowledge Graph",
|
||||
settings_class=NormalizeMapSettings,
|
||||
)
|
||||
|
||||
|
||||
# Initialize dependencies immediately
|
||||
@app.on_event("startup")
|
||||
async def startup_event(): # type: ignore
|
||||
await init_dependencies(cast(NormalizeMapSettings, _settings))
|
||||
|
||||
|
||||
tracer = get_tracer("svc-normalize-map")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
@@ -118,456 +90,235 @@ async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global event_bus, neo4j_client
|
||||
|
||||
logger.info("Shutting down normalize-map service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
logger.info("Shutting down NormalizeMap service")
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("Normalize-map service shutdown complete")
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
logger.info("NormalizeMap service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle document extracted events"""
|
||||
data = payload.data
|
||||
doc_id = data.get("doc_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
extracted_fields = data.get("extraction_results", {}).get("extracted_fields", {})
|
||||
provenance = data.get("extraction_results", {}).get("provenance", [])
|
||||
|
||||
if not doc_id or not tenant_id or not extracted_fields:
|
||||
logger.warning("Invalid document extracted event", data=data)
|
||||
return
|
||||
|
||||
@app.post("/normalize/{doc_id}")
|
||||
async def normalize_document(
|
||||
doc_id: str,
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Normalize and map document data to knowledge graph"""
|
||||
|
||||
with tracer.start_as_current_span("normalize_document") as span:
|
||||
with tracer.start_as_current_span("normalize_and_map") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Check if extraction results exist
|
||||
extraction_results = await document_storage.get_extraction_result(
|
||||
tenant_id, doc_id
|
||||
)
|
||||
if not extraction_results:
|
||||
raise HTTPException(
|
||||
status_code=404, detail="Extraction results not found"
|
||||
)
|
||||
# 1. Normalize data
|
||||
normalized_data = await _normalize_data(extracted_fields)
|
||||
|
||||
# Generate normalization ID
|
||||
normalization_id = str(ulid.new())
|
||||
span.set_attribute("normalization_id", normalization_id)
|
||||
|
||||
# Start background normalization
|
||||
background_tasks.add_task(
|
||||
_normalize_and_map_async,
|
||||
doc_id,
|
||||
tenant_id,
|
||||
extraction_results,
|
||||
normalization_id,
|
||||
current_user.get("sub", "system"),
|
||||
# 2. Map to KG ontology
|
||||
kg_upsert_payload = await _map_to_kg_ontology(
|
||||
doc_id, tenant_id, normalized_data, provenance
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Normalization started",
|
||||
doc_id=doc_id,
|
||||
normalization_id=normalization_id,
|
||||
# 3. Publish kg.upsert.ready event
|
||||
event_payload = EventPayload(
|
||||
data=kg_upsert_payload,
|
||||
actor=payload.actor,
|
||||
tenant_id=tenant_id,
|
||||
trace_id=str(span.get_span_context().trace_id),
|
||||
)
|
||||
await event_bus.publish(EventTopics.KG_UPSERT_READY, event_payload) # type: ignore
|
||||
|
||||
return {
|
||||
"normalization_id": normalization_id,
|
||||
"doc_id": doc_id,
|
||||
"status": "processing",
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to start normalization")
|
||||
|
||||
|
||||
async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle extraction completion events"""
|
||||
try:
|
||||
data = payload.data
|
||||
doc_id = data.get("doc_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
confidence = data.get("confidence", 0.0)
|
||||
|
||||
if not doc_id or not tenant_id:
|
||||
logger.warning("Invalid extraction completion event", data=data)
|
||||
return
|
||||
|
||||
# Only auto-process if confidence is above threshold
|
||||
if confidence >= settings.confidence_threshold:
|
||||
logger.info(
|
||||
"Auto-normalizing extracted document",
|
||||
doc_id=doc_id,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
extraction_results = data.get("extraction_results")
|
||||
if not extraction_results:
|
||||
extraction_results = await document_storage.get_extraction_result(
|
||||
tenant_id, doc_id
|
||||
)
|
||||
|
||||
if extraction_results:
|
||||
await _normalize_and_map_async(
|
||||
doc_id=doc_id,
|
||||
tenant_id=tenant_id,
|
||||
extraction_results=extraction_results,
|
||||
normalization_id=str(ulid.new()),
|
||||
actor=payload.actor,
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
"Skipping auto-normalization due to low confidence",
|
||||
doc_id=doc_id,
|
||||
confidence=confidence,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to handle extraction completion", error=str(e))
|
||||
|
||||
|
||||
async def _normalize_and_map_async(
|
||||
doc_id: str,
|
||||
tenant_id: str,
|
||||
extraction_results: dict[str, Any],
|
||||
normalization_id: str,
|
||||
actor: str,
|
||||
) -> None:
|
||||
"""Normalize and map data asynchronously"""
|
||||
|
||||
with tracer.start_as_current_span("normalize_and_map_async") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("normalization_id", normalization_id)
|
||||
|
||||
try:
|
||||
extracted_fields = extraction_results.get("extracted_fields", {})
|
||||
provenance = extraction_results.get("provenance", [])
|
||||
|
||||
# Normalize extracted data
|
||||
normalized_data = await _normalize_data(extracted_fields, provenance)
|
||||
|
||||
# Map to knowledge graph entities
|
||||
entities = await _map_to_entities(normalized_data, doc_id, tenant_id)
|
||||
|
||||
# Store entities in knowledge graph
|
||||
stored_entities = await _store_entities(entities, tenant_id)
|
||||
|
||||
# Create normalization results
|
||||
normalization_results = {
|
||||
"doc_id": doc_id,
|
||||
"normalization_id": normalization_id,
|
||||
"normalized_at": datetime.utcnow().isoformat(),
|
||||
"normalized_data": normalized_data,
|
||||
"entities": stored_entities,
|
||||
"entity_count": len(stored_entities),
|
||||
}
|
||||
|
||||
logger.info("Normalization completed", results=normalization_results)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("documents_normalized_total").labels(
|
||||
metrics.counter("normalized_documents_total").labels(
|
||||
tenant_id=tenant_id
|
||||
).inc()
|
||||
|
||||
metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
|
||||
len(stored_entities)
|
||||
)
|
||||
|
||||
# Publish completion event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"doc_id": doc_id,
|
||||
"tenant_id": tenant_id,
|
||||
"normalization_id": normalization_id,
|
||||
"entity_count": len(stored_entities),
|
||||
"entities": stored_entities,
|
||||
},
|
||||
actor=actor,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)
|
||||
|
||||
logger.info(
|
||||
"Normalization completed", doc_id=doc_id, entities=len(stored_entities)
|
||||
"Document normalized and mapped", doc_id=doc_id, tenant_id=tenant_id
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Normalization failed", doc_id=doc_id, error=str(e))
|
||||
|
||||
# Update error metrics
|
||||
logger.error(
|
||||
"Failed to normalize and map document", doc_id=doc_id, error=str(e)
|
||||
)
|
||||
metrics.counter("normalization_errors_total").labels(
|
||||
tenant_id=tenant_id, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
async def _normalize_data(
|
||||
extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
|
||||
) -> dict[str, Any]:
|
||||
"""Normalize extracted data"""
|
||||
|
||||
normalized = {}
|
||||
|
||||
for field_name, raw_value in extracted_fields.items():
|
||||
try:
|
||||
if "amount" in field_name.lower() or "total" in field_name.lower():
|
||||
normalized[field_name] = _normalize_amount(raw_value)
|
||||
elif "date" in field_name.lower():
|
||||
normalized[field_name] = _normalize_date(raw_value)
|
||||
elif "name" in field_name.lower():
|
||||
normalized[field_name] = _normalize_name(raw_value)
|
||||
elif "address" in field_name.lower():
|
||||
normalized[field_name] = _normalize_address(raw_value)
|
||||
elif "number" in field_name.lower():
|
||||
normalized[field_name] = _normalize_number(raw_value)
|
||||
else:
|
||||
normalized[field_name] = _normalize_text(raw_value)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"Failed to normalize field",
|
||||
field=field_name,
|
||||
value=raw_value,
|
||||
error=str(e),
|
||||
)
|
||||
normalized[field_name] = raw_value # Keep original value
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def _normalize_amount(value: str) -> dict[str, Any]:
|
||||
"""Normalize monetary amount"""
|
||||
import re
|
||||
|
||||
if not value:
|
||||
return {"amount": None, "currency": settings.currency_default}
|
||||
|
||||
# Remove currency symbols and formatting
|
||||
clean_value = re.sub(r"[£$€,\s]", "", str(value))
|
||||
|
||||
try:
|
||||
amount = Decimal(clean_value)
|
||||
|
||||
# Validate amount
|
||||
if amount > settings.max_amount:
|
||||
logger.warning("Amount exceeds maximum", amount=amount)
|
||||
|
||||
return {
|
||||
"amount": float(amount),
|
||||
"currency": settings.currency_default,
|
||||
"original": value,
|
||||
}
|
||||
except Exception:
|
||||
return {
|
||||
"amount": None,
|
||||
"currency": settings.currency_default,
|
||||
"original": value,
|
||||
}
|
||||
|
||||
|
||||
def _normalize_date(value: str) -> dict[str, Any]:
|
||||
"""Normalize date"""
|
||||
from dateutil import parser
|
||||
|
||||
if not value:
|
||||
return {"date": None, "original": value}
|
||||
|
||||
try:
|
||||
# Try parsing with dateutil first
|
||||
parsed_date = parser.parse(str(value), dayfirst=True)
|
||||
return {"date": parsed_date.date().isoformat(), "original": value}
|
||||
except Exception:
|
||||
# Try manual formats
|
||||
for fmt in settings.date_formats:
|
||||
async def _normalize_data(extracted_fields: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Normalize extracted data into a consistent format"""
|
||||
normalized_data = {}
|
||||
for key, value in extracted_fields.items():
|
||||
# Example: Simple date normalization (can be expanded)
|
||||
if "date" in key.lower() and isinstance(value, str):
|
||||
try:
|
||||
parsed_date = datetime.strptime(str(value), fmt)
|
||||
return {"date": parsed_date.date().isoformat(), "original": value}
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
return {"date": None, "original": value}
|
||||
# Attempt to parse various date formats
|
||||
# Add more robust date parsing logic here as needed
|
||||
normalized_data[key] = datetime.fromisoformat(value).date().isoformat()
|
||||
except ValueError:
|
||||
normalized_data[key] = value # Keep original if parsing fails
|
||||
elif "amount" in key.lower() and isinstance(value, str):
|
||||
# Example: Normalize currency to a Decimal
|
||||
try:
|
||||
normalized_data[key] = float(value.replace("£", "").replace(",", ""))
|
||||
except ValueError:
|
||||
normalized_data[key] = value
|
||||
else:
|
||||
normalized_data[key] = value
|
||||
return normalized_data
|
||||
|
||||
|
||||
def _normalize_name(value: str) -> dict[str, Any]:
|
||||
"""Normalize person/company name"""
|
||||
if not value:
|
||||
return {"name": None, "original": value}
|
||||
async def _map_to_kg_ontology(
|
||||
doc_id: str,
|
||||
tenant_id: str,
|
||||
normalized_data: dict[str, Any],
|
||||
provenance: list[dict[str, Any]],
|
||||
) -> dict[str, Any]:
|
||||
"""Map normalized data to Knowledge Graph ontology nodes and relationships based on kg_schema.json"""
|
||||
nodes = []
|
||||
relationships = []
|
||||
now = datetime.now(UTC).isoformat()
|
||||
|
||||
# Clean and title case
|
||||
clean_name = str(value).strip().title()
|
||||
# Create a Document node
|
||||
doc_node_id = f"document_{doc_id}"
|
||||
nodes.append(
|
||||
{
|
||||
"id": doc_node_id,
|
||||
"type": "Document",
|
||||
"properties": {
|
||||
"node_type": "Document",
|
||||
"doc_id": doc_id,
|
||||
"kind": normalized_data.get("kind", "OtherSupportingDoc"),
|
||||
"source": normalized_data.get("source", "manual_upload"),
|
||||
"checksum": normalized_data.get("checksum", ""),
|
||||
"valid_from": now,
|
||||
"asserted_at": now,
|
||||
# "source": "svc-normalize-map",
|
||||
"extractor_version": "1.0.0",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
# Detect if it's a company (contains Ltd, Limited, etc.)
|
||||
company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
|
||||
is_company = any(indicator in clean_name for indicator in company_indicators)
|
||||
# Create a TaxpayerProfile node
|
||||
taxpayer_id = normalized_data.get("taxpayer_id", "unknown_taxpayer")
|
||||
taxpayer_node_id = f"taxpayer_{taxpayer_id}"
|
||||
nodes.append(
|
||||
{
|
||||
"id": taxpayer_node_id,
|
||||
"type": "TaxpayerProfile",
|
||||
"properties": {
|
||||
"node_type": "TaxpayerProfile",
|
||||
"taxpayer_id": taxpayer_id,
|
||||
"type": "Individual",
|
||||
"valid_from": now,
|
||||
"asserted_at": now,
|
||||
"source": "svc-normalize-map",
|
||||
"extractor_version": "1.0.0",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
relationships.append(
|
||||
{
|
||||
"id": f"rel_document_to_taxpayer_{doc_id}",
|
||||
"type": "BELONGS_TO",
|
||||
"sourceId": doc_node_id,
|
||||
"targetId": taxpayer_node_id,
|
||||
"properties": {},
|
||||
}
|
||||
)
|
||||
|
||||
# Create IncomeItem/ExpenseItem nodes and Evidence nodes
|
||||
item_type = (
|
||||
"IncomeItem" if normalized_data.get("kind") == "invoice" else "ExpenseItem"
|
||||
)
|
||||
|
||||
for field, value in normalized_data.items():
|
||||
if field in ["total_amount", "net_amount", "vat_amount", "amount"]:
|
||||
item_id = f"item_{ulid.new()}"
|
||||
item_node_id = f"{item_type.lower()}_{item_id}"
|
||||
|
||||
# Create the financial item node (IncomeItem or ExpenseItem)
|
||||
nodes.append(
|
||||
{
|
||||
"id": item_node_id,
|
||||
"type": item_type,
|
||||
"properties": {
|
||||
"node_type": item_type,
|
||||
"type": (
|
||||
"self_employment"
|
||||
if "invoice" in normalized_data.get("kind", "")
|
||||
else "other"
|
||||
),
|
||||
"gross": value,
|
||||
"currency": "GBP",
|
||||
"description": normalized_data.get("description", field),
|
||||
"valid_from": now,
|
||||
"asserted_at": now,
|
||||
"source": "svc-normalize-map",
|
||||
"extractor_version": "1.0.0",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
relationships.append(
|
||||
{
|
||||
"id": f"rel_taxpayer_has_{item_type.lower()}_{item_id}",
|
||||
"type": (
|
||||
"HAS_INCOME" if item_type == "IncomeItem" else "HAS_EXPENSE"
|
||||
),
|
||||
"sourceId": taxpayer_node_id,
|
||||
"targetId": item_node_id,
|
||||
"properties": {},
|
||||
}
|
||||
)
|
||||
|
||||
# Create an Evidence node linking the item to the document
|
||||
prov = next((p for p in provenance if p["field"] == field), None)
|
||||
if prov:
|
||||
evidence_id = f"evidence_{item_id}"
|
||||
nodes.append(
|
||||
{
|
||||
"id": evidence_id,
|
||||
"type": "Evidence",
|
||||
"properties": {
|
||||
"node_type": "Evidence",
|
||||
"snippet_id": evidence_id,
|
||||
"doc_ref": doc_id,
|
||||
"page": prov.get("page"),
|
||||
"bbox": prov.get("bbox"),
|
||||
"text_hash": "dummy_hash", # Placeholder
|
||||
"ocr_confidence": prov.get("confidence"),
|
||||
"extracted_text": str(value),
|
||||
"valid_from": now,
|
||||
"asserted_at": now,
|
||||
"source": "svc-normalize-map",
|
||||
"extractor_version": "1.0.0",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
relationships.append(
|
||||
{
|
||||
"id": f"rel_item_supported_by_evidence_{item_id}",
|
||||
"type": "SUPPORTED_BY",
|
||||
"sourceId": item_node_id,
|
||||
"targetId": evidence_id,
|
||||
"properties": {},
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
"name": clean_name,
|
||||
"type": "company" if is_company else "person",
|
||||
"original": value,
|
||||
"nodes": nodes,
|
||||
"relationships": relationships,
|
||||
"document_id": doc_id,
|
||||
"tenant_id": tenant_id,
|
||||
}
|
||||
|
||||
|
||||
def _normalize_address(value: str) -> dict[str, Any]:
|
||||
"""Normalize address"""
|
||||
import re
|
||||
|
||||
if not value:
|
||||
return {"address": None, "original": value}
|
||||
|
||||
clean_address = str(value).strip()
|
||||
|
||||
# Extract UK postcode
|
||||
postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
|
||||
postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
|
||||
postcode = postcode_match.group().upper() if postcode_match else None
|
||||
|
||||
return {"address": clean_address, "postcode": postcode, "original": value}


def _normalize_number(value: str) -> dict[str, Any]:
    """Normalize reference numbers"""
    import re

    if not value:
        return {"number": None, "original": value}

    # Remove spaces and special characters
    clean_number = re.sub(r"[^\w]", "", str(value))

    # Detect number type
    number_type = "unknown"
    if len(clean_number) == 10 and clean_number.isdigit():
        number_type = "utr"  # UTR is 10 digits
    elif len(clean_number) == 8 and clean_number.isdigit():
        number_type = "account_number"
    elif re.match(r"^\d{6}$", clean_number):
        number_type = "sort_code"

    return {"number": clean_number, "type": number_type, "original": value}


def _normalize_text(value: str) -> dict[str, Any]:
    """Normalize general text"""
    if not value:
        return {"text": None, "original": value}

    clean_text = str(value).strip()

    return {"text": clean_text, "original": value}


async def _map_to_entities(
    normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
    """Map normalized data to knowledge graph entities"""

    entities = []

    # Create document entity
    doc_entity = {
        "type": "Document",
        "id": doc_id,
        "properties": {
            "doc_id": doc_id,
            "tenant_id": tenant_id,
            "processed_at": datetime.utcnow().isoformat(),
            "source": "extraction",
            "extractor_version": "1.0.0",
            "valid_from": datetime.utcnow(),
            "asserted_at": datetime.utcnow(),
        },
    }
    entities.append(doc_entity)

    # Map specific field types to entities
    for field_name, normalized_value in normalized_data.items():
        if isinstance(normalized_value, dict):
            if "amount" in normalized_value and normalized_value["amount"] is not None:
                # Create expense or income item
                entity_type = (
                    "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
                )
                entity = {
                    "type": entity_type,
                    "id": f"{entity_type.lower()}_{ulid.new()}",
                    "properties": {
                        "amount": normalized_value["amount"],
                        "currency": normalized_value["currency"],
                        "description": field_name,
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

            elif "name" in normalized_value and normalized_value["name"] is not None:
                # Create party entity
                entity = {
                    "type": "Party",
                    "id": f"party_{ulid.new()}",
                    "properties": {
                        "name": normalized_value["name"],
                        "party_type": normalized_value.get("type", "unknown"),
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

    return entities


async def _store_entities(
    entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
    """Store entities in knowledge graph"""

    stored_entities = []

    for entity in entities:
        try:
            # Create node in Neo4j
            result = await neo4j_client.create_node(
                label=entity["type"], properties=entity["properties"]
            )

            stored_entities.append(
                {
                    "type": entity["type"],
                    "id": entity["id"],
                    "neo4j_id": result.get("id"),
                    "properties": entity["properties"],
                }
            )

            logger.debug("Entity stored", type=entity["type"], id=entity["id"])

        except Exception as e:
            logger.error("Failed to store entity", entity=entity, error=str(e))

    return stored_entities
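
# Minimal usage sketch (assumes the module-level neo4j_client has been initialised;
# field names and IDs are illustrative only):
#
#   normalized = {"total_amount": {"amount": 1200.0, "currency": "GBP"}}
#   entities = await _map_to_entities(normalized, doc_id="doc_123", tenant_id="tenant_a")
#   stored = await _store_entities(entities, tenant_id="tenant_a")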


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
@@ -579,8 +330,8 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).dict(),
            trace_id=getattr(request.state, "trace_id", None),
        ).model_dump(),
    )
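
# For reference, an RFC 7807 problem response from this handler would look roughly
# like the following (illustrative values; only fields used above are shown):
#
#   {
#       "status": 404,
#       "detail": "Document not found",
#       "instance": "http://svc-normalize-map:8000/v1/documents/doc_123",
#       "trace_id": "a1b2c3...",
#   }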



@@ -1,37 +1 @@
# FastAPI and server
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0

# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.3.3
numpy>=2.3.3

# Currency and exchange rates
forex-python>=1.9.2
babel>=2.17.0

# Date and time processing
python-dateutil>=2.9.0
pytz>=2025.2

# Text normalization
unidecode>=1.4.0
phonenumbers>=9.0.16

# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.27.1

# Geographic data
geopy>=2.4.1
pycountry>=24.6.1

# Data validation
cerberus>=1.3.7
marshmallow>=4.0.1

# UK-specific utilities
uk-postcode-utils>=1.1
python-ulid

@@ -7,13 +7,14 @@ import os

# Import shared libraries
import sys
from contextlib import asynccontextmanager
from datetime import datetime
from typing import Any, cast

import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
@@ -78,6 +79,8 @@ settings: OCRSettings
async def init_dependencies(app_settings: OCRSettings) -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, event_bus, settings, vision_processor
    # Larger delay to ensure NATS is fully ready before attempting connection
    await asyncio.sleep(10)

    settings = app_settings
    logger.info("Starting OCR service")
@@ -89,17 +92,35 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)
    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        raise HTTPException(status_code=500, detail="Event bus not initialized")

    eb = event_bus
    # mypy: event_bus is Optional, so use local alias after check
    await eb.start()

    # Subscribe to document ingestion events
    await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
    # Initialize event bus with retry logic
    max_retries = 20
    delay = 5
    for attempt in range(1, max_retries + 1):
        logger.info(
            "Attempting NATS connection", url=settings.nats_servers, attempt=attempt
        )
        event_bus = create_event_bus(settings)
        if not event_bus:
            raise HTTPException(status_code=500, detail="Event bus not initialized")
        eb = event_bus
        try:
            # Attempt to start and subscribe
            await eb.start()
            await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
            logger.info("NATS connection established on attempt", attempt=attempt)
            break
        except Exception as e:
            logger.error(
                "Failed to connect to NATS, retrying",
                attempt=attempt,
                error=str(e),
            )
            if attempt == max_retries:
                raise HTTPException(
                    status_code=500, detail="Failed to connect to NATS after retries"
                )
            await asyncio.sleep(delay)
            delay *= 2  # exponential backoff
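
    # Sketch of the resulting wait schedule (arithmetic only, not a claim about real
    # NATS readiness times): delays double from 5s, i.e. 5, 10, 20, 40, ... seconds.
    # The cumulative wait before attempt 10 is already ~43 minutes, and the delay
    # before the final attempt would be 5 * 2**18 seconds (over two weeks). In
    # practice a cap such as delay = min(delay * 2, 60) keeps the loop responsive.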

    # Initialize shared OCRProcessor for vision strategy
    try:
@@ -114,7 +135,26 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
    logger.info("OCR service started successfully")


# Create app and settings
async def shutdown_dependencies() -> None:
    """Shutdown service dependencies"""
    logger.info("Shutting down OCR service")
    eb = event_bus
    if eb is not None:
        await eb.stop()
    logger.info("OCR service shutdown complete")


@asynccontextmanager
async def lifespan(app: FastAPI):  # type: ignore
    """FastAPI lifespan event handler"""
    # Startup
    await init_dependencies(cast(OCRSettings, _settings))
    yield
    # Shutdown
    await shutdown_dependencies()


# Create app and settings with lifespan
app, _settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
@@ -122,8 +162,8 @@ app, _settings = create_app(
    settings_class=OCRSettings,
)  # fmt: skip

# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
# Override app's lifespan
app.router.lifespan_context = lifespan

tracer = get_tracer("svc-ocr")
metrics = get_metrics()

@@ -14,3 +14,12 @@ opencv-python-headless>=4.12.0.88  # Headless version is smaller

# Computer vision (torchvision not in base-ml)
torchvision>=0.23.0

# OpenTelemetry (required by libs/observability)
opentelemetry-api>=1.21.0
opentelemetry-sdk>=1.21.0
opentelemetry-exporter-otlp-proto-grpc>=1.21.0
opentelemetry-instrumentation-fastapi>=0.42b0
opentelemetry-instrumentation-httpx>=0.42b0
opentelemetry-instrumentation-psycopg2>=0.42b0
opentelemetry-instrumentation-redis>=0.42b0

@@ -10,12 +10,15 @@ FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root

RUN apt-get update && apt-get install -y build-essential

# Set working directory
WORKDIR /app

# Copy service-specific requirements and install
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rag_indexer/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt

# Copy application code
COPY libs/ ./libs/
@@ -26,7 +29,7 @@ RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port

@@ -10,12 +10,15 @@ FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root

RUN apt-get update && apt-get install -y build-essential

# Set working directory
WORKDIR /app

# Copy service-specific requirements and install
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rag_retriever/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt

# Copy application code
COPY libs/ ./libs/
@@ -43,7 +43,7 @@ RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port

@@ -17,6 +17,7 @@ from datetime import datetime
from decimal import Decimal
from typing import Any

import httpx
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
@@ -55,6 +56,9 @@ class ReasonSettings(BaseAppSettings):
    max_income: float = 10000000.0  # £10M
    max_expenses: float = 10000000.0  # £10M

    # External services
    coverage_service_url: str = "http://svc-coverage:8000"


# Create app and settings
app, settings = create_app(
@@ -67,6 +71,7 @@ app, settings = create_app(
# Global clients
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
http_client: httpx.AsyncClient | None = None
tracer = get_tracer("svc-reason")
metrics = get_metrics()

@@ -74,7 +79,7 @@ metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies"""
    global neo4j_client, event_bus
    global neo4j_client, event_bus, http_client

    logger.info("Starting reasoning service")

@@ -89,6 +94,9 @@ async def startup_event() -> None:
    event_bus = create_event_bus(settings)
    await event_bus.start()  # fmt: skip  # pyright: ignore[reportOptionalMemberAccess]

    # Initialize HTTP client
    http_client = httpx.AsyncClient()

    # Subscribe to KG upsert events
    await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted)  # type: ignore

@@ -98,7 +106,7 @@ async def startup_event() -> None:
@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global neo4j_client, event_bus
    global neo4j_client, event_bus, http_client

    logger.info("Shutting down reasoning service")

@@ -108,6 +116,9 @@ async def shutdown_event() -> None:
    if event_bus:
        await event_bus.stop()

    if http_client:
        await http_client.aclose()

    logger.info("Reasoning service shutdown complete")

@@ -259,41 +270,76 @@ async def get_calculation_results(


async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
    """Handle KG upsert events for auto-calculation"""
    """Handle KG upsert events for auto-calculation and coverage check"""
    data = payload.data
    taxpayer_id = data.get("taxpayer_id")
    tax_year = data.get("tax_year")
    tenant_id = data.get("tenant_id")

    if not taxpayer_id or not tax_year or not tenant_id:
        logger.warning("Invalid KG upsert event data for coverage check", data=data)
        return

    # Trigger svc-coverage check
    try:
        data = payload.data
        entities = data.get("entities", [])
        tenant_id = data.get("tenant_id")

        # Check if we have enough data for calculation
        has_income = any(e.get("type") == "IncomeItem" for e in entities)
        has_expenses = any(e.get("type") == "ExpenseItem" for e in entities)

        if has_income or has_expenses:
        if http_client:
            coverage_url = f"{settings.coverage_service_url}/v1/coverage/check"
            request_body = {
                "tax_year": tax_year,
                "taxpayer_id": taxpayer_id,
            }
            headers = {
                "X-Tenant-ID": tenant_id,
                # Assuming current_user is not directly available here,
                # or a system user token needs to be generated.
                # For now, omitting X-Authenticated-User for simplicity,
                # but in a real system, this should be handled securely.
            }
            response = await http_client.post(coverage_url, json=request_body, headers=headers)
            response.raise_for_status()
            coverage_report = response.json()
            logger.info(
                "Auto-triggering calculation due to new financial data",
                tenant_id=tenant_id,
                "Triggered svc-coverage check",
                taxpayer_id=taxpayer_id,
                tax_year=tax_year,
                coverage_status=coverage_report.get("overall_status"),
            )

            # Find taxpayer ID from entities
            taxpayer_id = None
            for entity in entities:
                if entity.get("type") == "TaxpayerProfile":
                    taxpayer_id = entity.get("id")
                    break

            if taxpayer_id:
            # If coverage is complete, trigger calculation
            if coverage_report.get("overall_status") == "complete":
                logger.info(
                    "Coverage complete, auto-triggering calculation",
                    taxpayer_id=taxpayer_id,
                    tax_year=tax_year,
                )
                await _compute_schedule_async(
                    tax_year=settings.current_tax_year,
                    tax_year=tax_year,
                    taxpayer_id=taxpayer_id,
                    schedule_id="SA103",  # Default to self-employment
                    tenant_id=tenant_id or "",
                    tenant_id=tenant_id,
                    calculation_id=str(ulid.new()),
                    actor=payload.actor,
                )
            else:
                logger.info(
                    "Coverage incomplete, not triggering calculation",
                    taxpayer_id=taxpayer_id,
                    tax_year=tax_year,
                    blocking_items=coverage_report.get("blocking_items"),
                )

    except httpx.HTTPStatusError as e:
        logger.error(
            "Failed to trigger svc-coverage check due to HTTP error",
            taxpayer_id=taxpayer_id,
            tax_year=tax_year,
            error=str(e),
            response_status_code=e.response.status_code,
            response_text=e.response.text,
        )
    except Exception as e:
        logger.error("Failed to handle KG upsert for auto-calculation", error=str(e))
        logger.error("Failed to handle KG upsert for auto-calculation or coverage check", error=str(e))


async def _compute_schedule_async(
@@ -570,16 +616,107 @@ async def _compute_sa105(
async def _compute_sa100(
    financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
    """Compute SA100 (Main return) schedule"""

    # This would aggregate from other schedules
    # For now, return basic structure
    form_boxes = {
        "1": {"value": "John Doe", "description": "Your name", "confidence": 0.9}
    }
    """Compute SA100 (Main return) schedule by aggregating other schedules"""

    form_boxes = {}
    evidence_trail: list[dict[str, Any]] = []

    taxpayer_id = financial_data.get("taxpayer_id")
    tenant_id = financial_data.get("tenant_id")  # Assuming tenant_id is passed in financial_data

    if not taxpayer_id or not tenant_id:
        raise ValueError("Taxpayer ID or Tenant ID missing for SA100 computation")

    # Get latest SA103 calculation
    sa103_query = """
    MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_CALCULATION]->(c:Calculation)
    WHERE c.schedule = 'SA103' AND c.tax_year = $tax_year AND c.retracted_at IS NULL
    OPTIONAL MATCH (c)-[:HAS_BOX]->(b:FormBox)
    RETURN c.calculation_id AS calculation_id, c.calculated_at AS calculated_at, COLLECT({box: b.box, value: b.value, description: b.description, confidence: b.confidence}) AS form_boxes
    ORDER BY c.calculated_at DESC
    LIMIT 1
    """
    sa103_results = await neo4j_client.run_query(  # type: ignore
        sa103_query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id, "tax_year": tax_year}
    )
    sa103_calc = sa103_results[0] if sa103_results else None

    sa103_net_profit = Decimal("0")
    if sa103_calc and sa103_calc["form_boxes"]:
        for box in sa103_calc["form_boxes"]:
            if box["box"] == "32":  # Net profit box in SA103
                sa103_net_profit = Decimal(str(box["value"]))
                form_boxes["SA103_32"] = {"value": float(sa103_net_profit), "description": "SA103 Net Profit", "confidence": box.get("confidence", 0.9)}
                evidence_trail.append({
                    "box": "SA103_32",
                    "source_calculation_id": sa103_calc["calculation_id"],
                    "description": "Derived from SA103 Net Profit"
                })
                break

    # Get latest SA105 calculation
    sa105_query = """
    MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_CALCULATION]->(c:Calculation)
    WHERE c.schedule = 'SA105' AND c.tax_year = $tax_year AND c.retracted_at IS NULL
    OPTIONAL MATCH (c)-[:HAS_BOX]->(b:FormBox)
    RETURN c.calculation_id AS calculation_id, c.calculated_at AS calculated_at, COLLECT({box: b.box, value: b.value, description: b.description, confidence: b.confidence}) AS form_boxes
    ORDER BY c.calculated_at DESC
    LIMIT 1
    """
    sa105_results = await neo4j_client.run_query(  # type: ignore
        sa105_query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id, "tax_year": tax_year}
    )
    sa105_calc = sa105_results[0] if sa105_results else None

    sa105_net_income = Decimal("0")
    if sa105_calc and sa105_calc["form_boxes"]:
        for box in sa105_calc["form_boxes"]:
            if box["box"] == "net_income":  # Net property income box in SA105 (custom box for internal calculation)
                sa105_net_income = Decimal(str(box["value"]))
                form_boxes["SA105_net_income"] = {"value": float(sa105_net_income), "description": "SA105 Net Property Income", "confidence": box.get("confidence", 0.9)}
                evidence_trail.append({
                    "box": "SA105_net_income",
                    "source_calculation_id": sa105_calc["calculation_id"],
                    "description": "Derived from SA105 Net Property Income"
                })
                break

    # Aggregate total income for SA100
    total_income = sa103_net_profit + sa105_net_income
    form_boxes["SA100_total_income"] = {
        "value": float(total_income),
        "description": "Total income from all sources",
        "confidence": 0.95  # Higher confidence for aggregated value
    }
    evidence_trail.append({
        "box": "SA100_total_income",
        "derived_from": ["SA103_32", "SA105_net_income"],
        "description": "Aggregated from SA103 net profit and SA105 net property income"
    })

    # Example: Basic personal allowance (simplified)
    personal_allowance = Decimal("12570")  # For 2023-24
    if total_income > Decimal("100000"):  # Tapering not implemented here
        personal_allowance = Decimal("0")

    form_boxes["SA100_personal_allowance"] = {
        "value": float(personal_allowance),
        "description": "Personal Allowance",
        "confidence": 0.99
    }
    evidence_trail.append({
        "box": "SA100_personal_allowance",
        "source": "HMRC_guidance",
        "description": f"Standard personal allowance for {tax_year}"
    })

    # Placeholder for actual SA100 boxes and complex calculations
    # This would involve detailed tax band calculations, reliefs, etc.
    # For now, we'll just show the aggregation.
    form_boxes["1"] = {"value": "John Doe (Aggregated)", "description": "Your name", "confidence": 0.9}

    return form_boxes, evidence_trail

@@ -33,3 +33,4 @@ jinja2>=3.1.6

# Statistical calculations
scipy>=1.16.2
httpx