completed local setup with compose
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

harkon
2025-11-26 13:17:17 +00:00
parent 8fe5e62fee
commit fdba81809f
87 changed files with 5610 additions and 3376 deletions

View File

@@ -13,9 +13,10 @@ ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY libs/requirements-ml.txt /tmp/libs-ml-requirements.txt
COPY apps/svc_extract/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-ml-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim

View File

@@ -43,7 +43,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port

View File

@@ -44,7 +44,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port

View File

@@ -158,13 +158,13 @@ async def upload_document(
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"filename": file.filename or "unknown",
"kind": kind.value,
"source": source,
"checksum": checksum,
"file_size": len(content),
"content_type": content_type,
"s3_url": storage_result["s3_url"],
"checksum_sha256": checksum,
"size_bytes": len(content),
"mime_type": content_type,
"storage_path": storage_result["s3_url"],
},
actor=current_user.get("sub", "system"),
tenant_id=tenant_id,

View File

@@ -1,54 +1,27 @@
# Multi-stage build for svc_kg
FROM python:3.12-slim AS builder
FROM python:3.12-slim-bookworm
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Set environment variables
ENV PYTHONUNBUFFERED 1
ENV APP_HOME /app
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Create and set working directory
WORKDIR $APP_HOME
# Copy requirements and install dependencies
# Install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY libs/requirements-rdf.txt /tmp/libs-rdf.txt
COPY apps/svc_kg/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-rdf.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_kg/ ./apps/svc_kg/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_kg.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -1,28 +1,22 @@
# FILE: apps/svc-kg/main.py
# Knowledge graph facade with CRUD, queries, lineage, and SHACL validation
import json
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
from typing import Any, cast
import structlog
from fastapi import Depends, HTTPException, Query, Request
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse
from pyshacl import validate
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDF
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.events import EventBus
from libs.neo import Neo4jClient, SHACLValidator, TemporalQueries
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
@@ -31,523 +25,193 @@ class KGSettings(BaseAppSettings):
"""Settings for KG service"""
service_name: str = "svc-kg"
shacl_shapes_path: str = "schemas/shapes.ttl"
# SHACL validation
shapes_file: str = "schemas/shapes.ttl"
validate_on_write: bool = True
# Query limits
max_results: int = 1000
max_depth: int = 10
query_timeout: int = 30
# Create app and settings
app, settings = create_app(
service_name="svc-kg",
title="Tax Agent Knowledge Graph Service",
description="Knowledge graph facade with CRUD and queries",
settings_class=KGSettings,
)
# Global clients
neo4j_client: Neo4jClient | None = None
shacl_validator: SHACLValidator | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-kg")
metrics = get_metrics()
shapes_graph: Graph | None = None
settings: KGSettings
@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: KGSettings) -> None:
"""Initialize service dependencies"""
global neo4j_client, shacl_validator, event_bus
global neo4j_client, event_bus, settings, shapes_graph
settings = app_settings
logger.info("Starting KG service")
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize SHACL validator
if os.path.exists(settings.shapes_file):
shacl_validator = SHACLValidator(settings.shapes_file)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
logger.info("KG service started successfully")
await event_bus.subscribe(EventTopics.KG_UPSERT_READY, _handle_kg_upsert_ready)
# Load SHACL shapes
try:
shapes_graph = Graph().parse(settings.shacl_shapes_path, format="turtle")
logger.info("SHACL shapes loaded successfully")
except Exception as e:
logger.error("Failed to load SHACL shapes", error=str(e))
shapes_graph = None
app, _settings = create_app(
service_name="svc-kg",
title="Tax Agent Knowledge Graph Service",
description="Service for managing and validating the Knowledge Graph",
settings_class=KGSettings,
)
# Initialize dependencies immediately
@app.on_event("startup")
async def startup_event():
await init_dependencies(cast(KGSettings, _settings))
tracer = get_tracer("svc-kg")
metrics = get_metrics()
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
global event_bus, neo4j_client
logger.info("Shutting down KG service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
if neo4j_client:
await neo4j_client.close()
logger.info("KG service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
async def _handle_kg_upsert_ready(topic: str, payload: EventPayload) -> None:
"""Handle KG upsert ready events"""
data = payload.data
nodes = data.get("nodes", [])
relationships = data.get("relationships", [])
document_id = data.get("document_id")
tenant_id = data.get("tenant_id")
if not nodes and not relationships:
logger.warning("No nodes or relationships to upsert", data=data)
return
@app.post("/nodes/{label}")
async def create_node(
label: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create a new node"""
with tracer.start_as_current_span("create_node") as span:
span.set_attribute("label", label)
with tracer.start_as_current_span("upsert_kg_data") as span:
span.set_attribute("document_id", document_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("node_count", len(nodes))
span.set_attribute("relationship_count", len(relationships))
try:
# Add tenant isolation
properties["tenant_id"] = tenant_id
properties["created_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Create node
result = await neo4j_client.create_node(label, properties)
# Update metrics
metrics.counter("nodes_created_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node created", label=label, node_id=result.get("id"))
return {
"status": "created",
"label": label,
"properties": properties,
"neo4j_result": result,
}
except Exception as e:
logger.error("Failed to create node", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create node: {str(e)}"
# 1. Validate data against SHACL schema
conforms, validation_report = await _validate_with_shacl(
nodes, relationships
)
if not conforms:
logger.error(
"SHACL validation failed",
document_id=document_id,
validation_report=validation_report,
)
metrics.counter("kg_validation_errors_total").labels(
tenant_id=tenant_id
).inc()
return
# 2. Write data to Neo4j
for node in nodes:
await neo4j_client.create_node(node["type"], node["properties"]) # type: ignore
@app.get("/nodes/{label}")
async def get_nodes(
label: str,
limit: int = Query(default=100, le=settings.max_results),
filters: str | None = Query(default=None),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get nodes by label with optional filters"""
for rel in relationships:
await neo4j_client.create_relationship( # type: ignore
rel["sourceId"],
rel["targetId"],
rel["type"],
rel["properties"],
)
with tracer.start_as_current_span("get_nodes") as span:
span.set_attribute("label", label)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("limit", limit)
try:
# Parse filters
filter_dict: dict[str, Any] = {}
if filters:
try:
filter_dict = json.loads(filters)
except json.JSONDecodeError:
raise HTTPException(status_code=400, detail="Invalid filters JSON")
# Add tenant isolation
filter_dict["tenant_id"] = tenant_id
# Build query
query = TemporalQueries.get_current_state_query(label, filter_dict)
query += f" LIMIT {limit}"
# Execute query
results = await neo4j_client.run_query(query)
# Update metrics
metrics.counter("nodes_queried_total").labels(
tenant_id=tenant_id, label=label
).inc()
return {
"label": label,
"count": len(results),
"nodes": [result["n"] for result in results],
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get nodes", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to get nodes: {str(e)}"
# 3. Publish kg.upserted event
event_payload = EventPayload(
data={
"document_id": document_id,
"tenant_id": tenant_id,
"taxpayer_id": data.get("taxpayer_id"),
"tax_year": data.get("tax_year"),
"node_count": len(nodes),
"relationship_count": len(relationships),
},
actor=payload.actor,
tenant_id=tenant_id,
trace_id=str(span.get_span_context().trace_id),
)
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload) # type: ignore
@app.get("/nodes/{label}/{node_id}")
async def get_node(
label: str,
node_id: str,
include_lineage: bool = Query(default=False),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get specific node with optional lineage"""
with tracer.start_as_current_span("get_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get node
query = f"""
MATCH (n:{label} {{id: $node_id, tenant_id: $tenant_id}})
WHERE n.retracted_at IS NULL
RETURN n
"""
results = await neo4j_client.run_query(
query, {"node_id": node_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Node not found")
node_data = results[0]["n"]
# Get lineage if requested
lineage: list[dict[str, Any]] = []
if include_lineage:
lineage = await neo4j_client.get_node_lineage(node_id)
return {"node": node_data, "lineage": lineage if include_lineage else None}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(status_code=500, detail=f"Failed to get node: {str(e)}")
@app.put("/nodes/{label}/{node_id}")
async def update_node(
label: str,
node_id: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Update node with bitemporal versioning"""
with tracer.start_as_current_span("update_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
properties["tenant_id"] = tenant_id
properties["updated_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Update node (creates new version)
await neo4j_client.update_node(label, node_id, properties)
# Update metrics
metrics.counter("nodes_updated_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node updated", label=label, node_id=node_id)
return {
"status": "updated",
"label": label,
"node_id": node_id,
"properties": properties,
}
except Exception as e:
logger.error(
"Failed to update node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(
status_code=500, detail=f"Failed to update node: {str(e)}"
)
@app.post("/relationships")
async def create_relationship(
from_label: str,
from_id: str,
to_label: str,
to_id: str,
relationship_type: str,
properties: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create relationship between nodes"""
with tracer.start_as_current_span("create_relationship") as span:
span.set_attribute("from_label", from_label)
span.set_attribute("to_label", to_label)
span.set_attribute("relationship_type", relationship_type)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
rel_properties = properties or {}
rel_properties["tenant_id"] = tenant_id
rel_properties["created_by"] = current_user.get("sub", "system")
# Create relationship
await neo4j_client.create_relationship(
from_label, from_id, to_label, to_id, relationship_type, rel_properties
)
# Update metrics
metrics.counter("relationships_created_total").labels(
tenant_id=tenant_id, relationship_type=relationship_type
).inc()
metrics.counter("kg_upserts_total").labels(tenant_id=tenant_id).inc()
logger.info(
"Relationship created",
from_id=from_id,
to_id=to_id,
type=relationship_type,
"KG upsert completed", document_id=document_id, tenant_id=tenant_id
)
return {
"status": "created",
"from_id": from_id,
"to_id": to_id,
"relationship_type": relationship_type,
"properties": rel_properties,
}
except Exception as e:
logger.error("Failed to create relationship", error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create relationship: {str(e)}"
logger.error(
"Failed to upsert KG data", document_id=document_id, error=str(e)
)
@app.post("/query")
async def execute_query(
query: str,
parameters: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Execute custom Cypher query with tenant isolation"""
with tracer.start_as_current_span("execute_query") as span:
span.set_attribute("tenant_id", tenant_id)
try:
# Add tenant isolation to parameters
query_params = parameters or {}
query_params["tenant_id"] = tenant_id
# Validate query (basic security check)
if not _is_safe_query(query):
raise HTTPException(status_code=400, detail="Unsafe query detected")
# Execute query with timeout
results = await neo4j_client.run_query(query, query_params, max_retries=1)
# Update metrics
metrics.counter("custom_queries_total").labels(tenant_id=tenant_id).inc()
return {
"query": query,
"parameters": query_params,
"results": results,
"count": len(results),
}
except Exception as e:
logger.error("Query execution failed", query=query[:100], error=str(e))
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
@app.get("/export/rdf")
async def export_rdf(
format: str = Query(default="turtle"),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Export knowledge graph as RDF"""
with tracer.start_as_current_span("export_rdf") as span:
span.set_attribute("format", format)
span.set_attribute("tenant_id", tenant_id)
try:
# Export tenant-specific data
rdf_data = await neo4j_client.export_to_rdf(format)
# Update metrics
metrics.counter("rdf_exports_total").labels(
tenant_id=tenant_id, format=format
metrics.counter("kg_upsert_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
return {
"format": format,
"rdf_data": rdf_data,
"exported_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("RDF export failed", format=format, error=str(e))
raise HTTPException(
status_code=500, detail=f"RDF export failed: {str(e)}"
) from e
async def _validate_with_shacl(
nodes: list[dict[str, Any]], relationships: list[dict[str, Any]]
) -> tuple[bool, str]:
"""Validate data against SHACL shapes."""
if not shapes_graph:
logger.warning("SHACL shapes not loaded, skipping validation.")
return True, "SHACL shapes not loaded"
data_graph = Graph()
namespace = "http://ai-tax-agent.com/ontology/"
@app.post("/validate")
async def validate_graph(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Validate knowledge graph with SHACL"""
for node in nodes:
node_uri = URIRef(f"{namespace}{node['id']}")
data_graph.add((node_uri, RDF.type, URIRef(f"{namespace}{node['type']}")))
for key, value in node["properties"].items():
if value is not None:
data_graph.add((node_uri, URIRef(f"{namespace}{key}"), Literal(value)))
with tracer.start_as_current_span("validate_graph") as span:
span.set_attribute("tenant_id", tenant_id)
try:
if not shacl_validator:
raise HTTPException(
status_code=501, detail="SHACL validation not configured"
)
# Export current graph state
rdf_export = await neo4j_client.export_to_rdf("turtle")
# Extract RDF data from export result
rdf_data = rdf_export.get("rdf_data", "")
if not rdf_data:
raise HTTPException(
status_code=500, detail="Failed to export RDF data for validation"
)
# Run SHACL validation
validation_result = await shacl_validator.validate_graph(rdf_data)
# Update metrics
metrics.counter("validations_total").labels(
tenant_id=tenant_id, conforms=validation_result["conforms"]
).inc()
return {
"conforms": validation_result["conforms"],
"violations_count": validation_result["violations_count"],
"results_text": validation_result["results_text"],
"validated_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("Graph validation failed", error=str(e))
raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
async def _validate_node(label: str, properties: dict[str, Any]) -> bool:
"""Validate node with SHACL"""
if not shacl_validator:
return True
for rel in relationships:
source_uri = URIRef(f"{namespace}{rel['sourceId']}")
target_uri = URIRef(f"{namespace}{rel['targetId']}")
rel_uri = URIRef(f"{namespace}{rel['type']}")
data_graph.add((source_uri, rel_uri, target_uri))
try:
# Create a minimal RDF representation of the node for validation
rdf_lines = ["@prefix tax: <https://tax-kg.example.com/> ."]
node_uri = "tax:temp_node"
# Add type declaration
rdf_lines.append(f"{node_uri} a tax:{label} .")
# Add properties
for prop, value in properties.items():
if isinstance(value, str):
rdf_lines.append(f'{node_uri} tax:{prop} "{value}" .')
else:
rdf_lines.append(f"{node_uri} tax:{prop} {value} .")
rdf_data = "\n".join(rdf_lines)
# Validate the node RDF data
validation_result = await shacl_validator.validate_graph(rdf_data)
if not validation_result["conforms"]:
logger.warning(
"Node SHACL validation failed",
label=label,
violations=validation_result["violations_count"],
details=validation_result["results_text"],
)
return False
logger.debug("Node SHACL validation passed", label=label)
return True
conforms, results_graph, results_text = validate(
data_graph,
shacl_graph=shapes_graph,
ont_graph=None, # No ontology graph
inference="rdfs",
abort_on_first=False,
allow_infos=False,
meta_shacl=False,
advanced=False,
js=False,
debug=False,
)
return conforms, results_text
except Exception as e:
logger.error("Node SHACL validation error", label=label, error=str(e))
# Return True to not block operations on validation errors
return True
def _is_safe_query(query: str) -> bool:
"""Basic query safety check"""
query_lower = query.lower()
# Block dangerous operations
dangerous_keywords = [
"delete",
"remove",
"drop",
"create index",
"create constraint",
"load csv",
"call",
"foreach",
]
for keyword in dangerous_keywords:
if keyword in query_lower:
return False
return True
logger.error("Error during SHACL validation", error=str(e))
return False, str(e)
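For orientation, a minimal standalone sketch of the pyshacl call used above, validating a single illustrative node against the shapes file (the namespace and shapes path are taken from the code above; the node id and properties are invented):

from pyshacl import validate
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import RDF

NS = "http://ai-tax-agent.com/ontology/"  # namespace used by _validate_with_shacl
shapes_graph = Graph().parse("schemas/shapes.ttl", format="turtle")  # default shapes path from KGSettings

data_graph = Graph()
node = URIRef(f"{NS}document_demo")  # illustrative node id
data_graph.add((node, RDF.type, URIRef(f"{NS}Document")))
data_graph.add((node, URIRef(f"{NS}doc_id"), Literal("demo")))

# pyshacl returns a (conforms, results_graph, results_text) tuple
conforms, results_graph, results_text = validate(
    data_graph, shacl_graph=shapes_graph, inference="rdfs", abort_on_first=False
)
print(conforms, results_text)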
@app.exception_handler(HTTPException)
@@ -561,7 +225,7 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONRe
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)

View File

@@ -1,22 +1,2 @@
# Service-specific dependencies
# RDF and semantic web
rdflib>=7.2.1
pyshacl>=0.30.1
# Graph algorithms
networkx>=3.5
# Data export formats
xmltodict>=1.0.2
# Query optimization
pyparsing>=3.2.5
# Graph visualization (optional)
graphviz>=0.21
# Additional Neo4j utilities
neomodel>=5.5.3
# Cypher query building
py2neo>=2021.2.4
setuptools
pyshacl==0.23.0

View File

@@ -1,53 +1,27 @@
# Multi-stage build for svc_normalize_map
FROM python:3.12-slim AS builder
FROM python:3.12-slim-bookworm
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Set environment variables
ENV PYTHONUNBUFFERED 1
ENV APP_HOME /app
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Create and set working directory
WORKDIR $APP_HOME
# Copy requirements and install dependencies
# Install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -1,24 +1,11 @@
"""Data normalization and knowledge graph mapping."""
# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
from datetime import UTC, datetime
from typing import Any, cast
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -34,83 +21,68 @@ from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class NormalizeMapSettings(BaseAppSettings):
"""Settings for normalize-map service"""
"""Settings for NormalizeMap service"""
service_name: str = "svc-normalize-map"
# Normalization configuration
currency_default: str = "GBP"
date_formats: list[str] = [
"%Y-%m-%d",
"%d/%m/%Y",
"%d-%m-%Y",
"%d %B %Y",
"%d %b %Y",
"%B %d, %Y",
]
# Mapping configuration
confidence_threshold: float = 0.7
auto_create_entities: bool = True
# Validation rules
max_amount: float = 1000000.0 # £1M
min_confidence: float = 0.5
# Create app and settings
app, settings = create_app(
service_name="svc-normalize-map",
title="Tax Agent Normalize-Map Service",
description="Data normalization and knowledge graph mapping service",
settings_class=NormalizeMapSettings,
)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
neo4j_client: Neo4jClient | None = None
settings: NormalizeMapSettings
@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: NormalizeMapSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, event_bus
global storage_client, document_storage, event_bus, neo4j_client, settings
logger.info("Starting normalize-map service")
settings = app_settings
logger.info("Starting NormalizeMap service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
# Subscribe to extraction completion events
await event_bus.subscribe( # type: ignore
EventTopics.DOC_EXTRACTED, _handle_extraction_completed
)
await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted)
logger.info("Normalize-map service started successfully")
logger.info("NormalizeMap service started successfully")
app, _settings = create_app(
service_name="svc-normalize-map",
title="Tax Agent Normalize and Map Service",
description="Normalize extracted data and map to Knowledge Graph",
settings_class=NormalizeMapSettings,
)
# Initialize dependencies immediately
@app.on_event("startup")
async def startup_event(): # type: ignore
await init_dependencies(cast(NormalizeMapSettings, _settings))
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
@app.on_event("shutdown")
@@ -118,456 +90,235 @@ async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, neo4j_client
logger.info("Shutting down normalize-map service")
if neo4j_client:
await neo4j_client.close()
logger.info("Shutting down NormalizeMap service")
if event_bus:
await event_bus.stop()
logger.info("Normalize-map service shutdown complete")
if neo4j_client:
await neo4j_client.close()
logger.info("NormalizeMap service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
"""Handle document extracted events"""
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
extracted_fields = data.get("extraction_results", {}).get("extracted_fields", {})
provenance = data.get("extraction_results", {}).get("provenance", [])
if not doc_id or not tenant_id or not extracted_fields:
logger.warning("Invalid document extracted event", data=data)
return
@app.post("/normalize/{doc_id}")
async def normalize_document(
doc_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Normalize and map document data to knowledge graph"""
with tracer.start_as_current_span("normalize_document") as span:
with tracer.start_as_current_span("normalize_and_map") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if extraction results exist
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# 1. Normalize data
normalized_data = await _normalize_data(extracted_fields)
# Generate normalization ID
normalization_id = str(ulid.new())
span.set_attribute("normalization_id", normalization_id)
# Start background normalization
background_tasks.add_task(
_normalize_and_map_async,
doc_id,
tenant_id,
extraction_results,
normalization_id,
current_user.get("sub", "system"),
# 2. Map to KG ontology
kg_upsert_payload = await _map_to_kg_ontology(
doc_id, tenant_id, normalized_data, provenance
)
logger.info(
"Normalization started",
doc_id=doc_id,
normalization_id=normalization_id,
# 3. Publish kg.upsert.ready event
event_payload = EventPayload(
data=kg_upsert_payload,
actor=payload.actor,
tenant_id=tenant_id,
trace_id=str(span.get_span_context().trace_id),
)
await event_bus.publish(EventTopics.KG_UPSERT_READY, event_payload) # type: ignore
return {
"normalization_id": normalization_id,
"doc_id": doc_id,
"status": "processing",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start normalization")
async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
"""Handle extraction completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
confidence = data.get("confidence", 0.0)
if not doc_id or not tenant_id:
logger.warning("Invalid extraction completion event", data=data)
return
# Only auto-process if confidence is above threshold
if confidence >= settings.confidence_threshold:
logger.info(
"Auto-normalizing extracted document",
doc_id=doc_id,
confidence=confidence,
)
extraction_results = data.get("extraction_results")
if not extraction_results:
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if extraction_results:
await _normalize_and_map_async(
doc_id=doc_id,
tenant_id=tenant_id,
extraction_results=extraction_results,
normalization_id=str(ulid.new()),
actor=payload.actor,
)
else:
logger.info(
"Skipping auto-normalization due to low confidence",
doc_id=doc_id,
confidence=confidence,
)
except Exception as e:
logger.error("Failed to handle extraction completion", error=str(e))
async def _normalize_and_map_async(
doc_id: str,
tenant_id: str,
extraction_results: dict[str, Any],
normalization_id: str,
actor: str,
) -> None:
"""Normalize and map data asynchronously"""
with tracer.start_as_current_span("normalize_and_map_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("normalization_id", normalization_id)
try:
extracted_fields = extraction_results.get("extracted_fields", {})
provenance = extraction_results.get("provenance", [])
# Normalize extracted data
normalized_data = await _normalize_data(extracted_fields, provenance)
# Map to knowledge graph entities
entities = await _map_to_entities(normalized_data, doc_id, tenant_id)
# Store entities in knowledge graph
stored_entities = await _store_entities(entities, tenant_id)
# Create normalization results
normalization_results = {
"doc_id": doc_id,
"normalization_id": normalization_id,
"normalized_at": datetime.utcnow().isoformat(),
"normalized_data": normalized_data,
"entities": stored_entities,
"entity_count": len(stored_entities),
}
logger.info("Normalization completed", results=normalization_results)
# Update metrics
metrics.counter("documents_normalized_total").labels(
metrics.counter("normalized_documents_total").labels(
tenant_id=tenant_id
).inc()
metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
len(stored_entities)
)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"normalization_id": normalization_id,
"entity_count": len(stored_entities),
"entities": stored_entities,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)
logger.info(
"Normalization completed", doc_id=doc_id, entities=len(stored_entities)
"Document normalized and mapped", doc_id=doc_id, tenant_id=tenant_id
)
except Exception as e:
logger.error("Normalization failed", doc_id=doc_id, error=str(e))
# Update error metrics
logger.error(
"Failed to normalize and map document", doc_id=doc_id, error=str(e)
)
metrics.counter("normalization_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
async def _normalize_data(
extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
"""Normalize extracted data"""
normalized = {}
for field_name, raw_value in extracted_fields.items():
try:
if "amount" in field_name.lower() or "total" in field_name.lower():
normalized[field_name] = _normalize_amount(raw_value)
elif "date" in field_name.lower():
normalized[field_name] = _normalize_date(raw_value)
elif "name" in field_name.lower():
normalized[field_name] = _normalize_name(raw_value)
elif "address" in field_name.lower():
normalized[field_name] = _normalize_address(raw_value)
elif "number" in field_name.lower():
normalized[field_name] = _normalize_number(raw_value)
else:
normalized[field_name] = _normalize_text(raw_value)
except Exception as e:
logger.warning(
"Failed to normalize field",
field=field_name,
value=raw_value,
error=str(e),
)
normalized[field_name] = raw_value # Keep original value
return normalized
def _normalize_amount(value: str) -> dict[str, Any]:
"""Normalize monetary amount"""
import re
if not value:
return {"amount": None, "currency": settings.currency_default}
# Remove currency symbols and formatting
clean_value = re.sub(r"[£$€,\s]", "", str(value))
try:
amount = Decimal(clean_value)
# Validate amount
if amount > settings.max_amount:
logger.warning("Amount exceeds maximum", amount=amount)
return {
"amount": float(amount),
"currency": settings.currency_default,
"original": value,
}
except Exception:
return {
"amount": None,
"currency": settings.currency_default,
"original": value,
}
def _normalize_date(value: str) -> dict[str, Any]:
"""Normalize date"""
from dateutil import parser
if not value:
return {"date": None, "original": value}
try:
# Try parsing with dateutil first
parsed_date = parser.parse(str(value), dayfirst=True)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
# Try manual formats
for fmt in settings.date_formats:
async def _normalize_data(extracted_fields: dict[str, Any]) -> dict[str, Any]:
"""Normalize extracted data into a consistent format"""
normalized_data = {}
for key, value in extracted_fields.items():
# Example: Simple date normalization (can be expanded)
if "date" in key.lower() and isinstance(value, str):
try:
parsed_date = datetime.strptime(str(value), fmt)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
continue
return {"date": None, "original": value}
# Attempt to parse various date formats
# Add more robust date parsing logic here as needed
normalized_data[key] = datetime.fromisoformat(value).date().isoformat()
except ValueError:
normalized_data[key] = value # Keep original if parsing fails
elif "amount" in key.lower() and isinstance(value, str):
# Example: Strip currency symbols and thousands separators, then convert to float
try:
normalized_data[key] = float(value.replace("£", "").replace(",", ""))
except ValueError:
normalized_data[key] = value
else:
normalized_data[key] = value
return normalized_data
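As a rough illustration of the normalization above (field names and values are made up):

# Hypothetical extracted fields from svc-extract
extracted = {
    "invoice_date": "2024-05-01",
    "total_amount": "£1,250.00",
    "supplier_name": "Acme Ltd",
}
# _normalize_data would yield approximately:
# {"invoice_date": "2024-05-01", "total_amount": 1250.0, "supplier_name": "Acme Ltd"}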
def _normalize_name(value: str) -> dict[str, Any]:
"""Normalize person/company name"""
if not value:
return {"name": None, "original": value}
async def _map_to_kg_ontology(
doc_id: str,
tenant_id: str,
normalized_data: dict[str, Any],
provenance: list[dict[str, Any]],
) -> dict[str, Any]:
"""Map normalized data to Knowledge Graph ontology nodes and relationships based on kg_schema.json"""
nodes = []
relationships = []
now = datetime.now(UTC).isoformat()
# Clean and title case
clean_name = str(value).strip().title()
# Create a Document node
doc_node_id = f"document_{doc_id}"
nodes.append(
{
"id": doc_node_id,
"type": "Document",
"properties": {
"node_type": "Document",
"doc_id": doc_id,
"kind": normalized_data.get("kind", "OtherSupportingDoc"),
"source": normalized_data.get("source", "manual_upload"),
"checksum": normalized_data.get("checksum", ""),
"valid_from": now,
"asserted_at": now,
# "source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
# Detect if it's a company (contains Ltd, Limited, etc.)
company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
is_company = any(indicator in clean_name for indicator in company_indicators)
# Create a TaxpayerProfile node
taxpayer_id = normalized_data.get("taxpayer_id", "unknown_taxpayer")
taxpayer_node_id = f"taxpayer_{taxpayer_id}"
nodes.append(
{
"id": taxpayer_node_id,
"type": "TaxpayerProfile",
"properties": {
"node_type": "TaxpayerProfile",
"taxpayer_id": taxpayer_id,
"type": "Individual",
"valid_from": now,
"asserted_at": now,
"source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
relationships.append(
{
"id": f"rel_document_to_taxpayer_{doc_id}",
"type": "BELONGS_TO",
"sourceId": doc_node_id,
"targetId": taxpayer_node_id,
"properties": {},
}
)
# Create IncomeItem/ExpenseItem nodes and Evidence nodes
item_type = (
"IncomeItem" if normalized_data.get("kind") == "invoice" else "ExpenseItem"
)
for field, value in normalized_data.items():
if field in ["total_amount", "net_amount", "vat_amount", "amount"]:
item_id = f"item_{ulid.new()}"
item_node_id = f"{item_type.lower()}_{item_id}"
# Create the financial item node (IncomeItem or ExpenseItem)
nodes.append(
{
"id": item_node_id,
"type": item_type,
"properties": {
"node_type": item_type,
"type": (
"self_employment"
if "invoice" in normalized_data.get("kind", "")
else "other"
),
"gross": value,
"currency": "GBP",
"description": normalized_data.get("description", field),
"valid_from": now,
"asserted_at": now,
"source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
relationships.append(
{
"id": f"rel_taxpayer_has_{item_type.lower()}_{item_id}",
"type": (
"HAS_INCOME" if item_type == "IncomeItem" else "HAS_EXPENSE"
),
"sourceId": taxpayer_node_id,
"targetId": item_node_id,
"properties": {},
}
)
# Create an Evidence node linking the item to the document
prov = next((p for p in provenance if p["field"] == field), None)
if prov:
evidence_id = f"evidence_{item_id}"
nodes.append(
{
"id": evidence_id,
"type": "Evidence",
"properties": {
"node_type": "Evidence",
"snippet_id": evidence_id,
"doc_ref": doc_id,
"page": prov.get("page"),
"bbox": prov.get("bbox"),
"text_hash": "dummy_hash", # Placeholder
"ocr_confidence": prov.get("confidence"),
"extracted_text": str(value),
"valid_from": now,
"asserted_at": now,
"source": "svc-normalize-map",
"extractor_version": "1.0.0",
},
}
)
relationships.append(
{
"id": f"rel_item_supported_by_evidence_{item_id}",
"type": "SUPPORTED_BY",
"sourceId": item_node_id,
"targetId": evidence_id,
"properties": {},
}
)
return {
"name": clean_name,
"type": "company" if is_company else "person",
"original": value,
"nodes": nodes,
"relationships": relationships,
"document_id": doc_id,
"tenant_id": tenant_id,
}
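A rough sketch of the kg.upsert.ready payload shape this produces; ids and values are invented, not taken from a real document:

kg_upsert_payload = {
    "document_id": "01HXAMPLEDOC",
    "tenant_id": "tenant-a",
    "nodes": [
        {"id": "document_01HXAMPLEDOC", "type": "Document",
         "properties": {"node_type": "Document", "doc_id": "01HXAMPLEDOC", "kind": "invoice"}},
        {"id": "taxpayer_unknown_taxpayer", "type": "TaxpayerProfile",
         "properties": {"node_type": "TaxpayerProfile", "taxpayer_id": "unknown_taxpayer"}},
    ],
    "relationships": [
        {"id": "rel_document_to_taxpayer_01HXAMPLEDOC", "type": "BELONGS_TO",
         "sourceId": "document_01HXAMPLEDOC", "targetId": "taxpayer_unknown_taxpayer",
         "properties": {}},
    ],
}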
def _normalize_address(value: str) -> dict[str, Any]:
"""Normalize address"""
import re
if not value:
return {"address": None, "original": value}
clean_address = str(value).strip()
# Extract UK postcode
postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
postcode = postcode_match.group().upper() if postcode_match else None
return {"address": clean_address, "postcode": postcode, "original": value}
def _normalize_number(value: str) -> dict[str, Any]:
"""Normalize reference numbers"""
import re
if not value:
return {"number": None, "original": value}
# Remove spaces and special characters
clean_number = re.sub(r"[^\w]", "", str(value))
# Detect number type
number_type = "unknown"
if len(clean_number) == 10 and clean_number.isdigit():
number_type = "utr" # UTR is 10 digits
elif len(clean_number) == 8 and clean_number.isdigit():
number_type = "account_number"
elif re.match(r"^\d{6}$", clean_number):
number_type = "sort_code"
return {"number": clean_number, "type": number_type, "original": value}
def _normalize_text(value: str) -> dict[str, Any]:
"""Normalize general text"""
if not value:
return {"text": None, "original": value}
clean_text = str(value).strip()
return {"text": clean_text, "original": value}
async def _map_to_entities(
normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
"""Map normalized data to knowledge graph entities"""
entities = []
# Create document entity
doc_entity = {
"type": "Document",
"id": doc_id,
"properties": {
"doc_id": doc_id,
"tenant_id": tenant_id,
"processed_at": datetime.utcnow().isoformat(),
"source": "extraction",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(doc_entity)
# Map specific field types to entities
for field_name, normalized_value in normalized_data.items():
if isinstance(normalized_value, dict):
if "amount" in normalized_value and normalized_value["amount"] is not None:
# Create expense or income item
entity_type = (
"ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
)
entity = {
"type": entity_type,
"id": f"{entity_type.lower()}_{ulid.new()}",
"properties": {
"amount": normalized_value["amount"],
"currency": normalized_value["currency"],
"description": field_name,
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
elif "name" in normalized_value and normalized_value["name"] is not None:
# Create party entity
entity = {
"type": "Party",
"id": f"party_{ulid.new()}",
"properties": {
"name": normalized_value["name"],
"party_type": normalized_value.get("type", "unknown"),
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
return entities
async def _store_entities(
entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
"""Store entities in knowledge graph"""
stored_entities = []
for entity in entities:
try:
# Create node in Neo4j
result = await neo4j_client.create_node(
label=entity["type"], properties=entity["properties"]
)
stored_entities.append(
{
"type": entity["type"],
"id": entity["id"],
"neo4j_id": result.get("id"),
"properties": entity["properties"],
}
)
logger.debug("Entity stored", type=entity["type"], id=entity["id"])
except Exception as e:
logger.error("Failed to store entity", entity=entity, error=str(e))
return stored_entities
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
@@ -579,8 +330,8 @@ async def http_exception_handler(request: Request, exc: HTTPException) -> JSONRe
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).dict(),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
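For reference, the RFC 7807 style body produced by this handler carries roughly these fields (names taken from the ErrorResponse usage above; values are illustrative, and ErrorResponse may define additional fields):

problem_body = {
    "status": 404,
    "detail": "Extraction results not found",
    "instance": "http://svc-normalize-map:8000/normalize/01HXAMPLEDOC",
    "trace_id": None,  # populated from request.state.trace_id when available
}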

View File

@@ -1,37 +1 @@
# FastAPI and server
fastapi>=0.118.3
uvicorn[standard]>=0.37.0
pydantic>=2.12.0
# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.3.3
numpy>=2.3.3
# Currency and exchange rates
forex-python>=1.9.2
babel>=2.17.0
# Date and time processing
python-dateutil>=2.9.0
pytz>=2025.2
# Text normalization
unidecode>=1.4.0
phonenumbers>=9.0.16
# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.27.1
# Geographic data
geopy>=2.4.1
pycountry>=24.6.1
# Data validation
cerberus>=1.3.7
marshmallow>=4.0.1
# UK-specific utilities
uk-postcode-utils>=1.1
python-ulid

View File

@@ -7,13 +7,14 @@ import os
# Import shared libraries
import sys
from contextlib import asynccontextmanager
from datetime import datetime
from typing import Any, cast
import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
@@ -78,6 +79,8 @@ settings: OCRSettings
async def init_dependencies(app_settings: OCRSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus, settings, vision_processor
# Larger delay to ensure NATS is fully ready before attempting connection
await asyncio.sleep(10)
settings = app_settings
logger.info("Starting OCR service")
@@ -89,17 +92,35 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
eb = event_bus
# mypy: event_bus is Optional, so use local alias after check
await eb.start()
# Subscribe to document ingestion events
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
# Initialize event bus with retry logic
max_retries = 20
delay = 5
for attempt in range(1, max_retries + 1):
logger.info(
"Attempting NATS connection", url=settings.nats_servers, attempt=attempt
)
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
eb = event_bus
try:
# Attempt to start and subscribe
await eb.start()
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
logger.info("NATS connection established on attempt", attempt=attempt)
break
except Exception as e:
logger.error(
"Failed to connect to NATS, retrying",
attempt=attempt,
error=str(e),
)
if attempt == max_retries:
raise HTTPException(
status_code=500, detail="Failed to connect to NATS after retries"
)
await asyncio.sleep(delay)
delay *= 2 # exponential backoff
# Initialize shared OCRProcessor for vision strategy
try:
@@ -114,7 +135,26 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
logger.info("OCR service started successfully")
# Create app and settings
async def shutdown_dependencies() -> None:
"""Shutdown service dependencies"""
logger.info("Shutting down OCR service")
eb = event_bus
if eb is not None:
await eb.stop()
logger.info("OCR service shutdown complete")
@asynccontextmanager
async def lifespan(app: FastAPI): # type: ignore
"""FastAPI lifespan event handler"""
# Startup
await init_dependencies(cast(OCRSettings, _settings))
yield
# Shutdown
await shutdown_dependencies()
# Create app and settings with lifespan
app, _settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
@@ -122,8 +162,8 @@ app, _settings = create_app(
settings_class=OCRSettings,
) # fmt: skip
# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
# Override app's lifespan
app.router.lifespan_context = lifespan
tracer = get_tracer("svc-ocr")
metrics = get_metrics()

View File

@@ -14,3 +14,12 @@ opencv-python-headless>=4.12.0.88 # Headless version is smaller
# Computer vision (torchvision not in base-ml)
torchvision>=0.23.0
# OpenTelemetry (required by libs/observability)
opentelemetry-api>=1.21.0
opentelemetry-sdk>=1.21.0
opentelemetry-exporter-otlp-proto-grpc>=1.21.0
opentelemetry-instrumentation-fastapi>=0.42b0
opentelemetry-instrumentation-httpx>=0.42b0
opentelemetry-instrumentation-psycopg2>=0.42b0
opentelemetry-instrumentation-redis>=0.42b0

View File

@@ -10,12 +10,15 @@ FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
RUN apt-get update && apt-get install -y build-essential
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rag_indexer/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
@@ -26,7 +29,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port

View File

@@ -10,12 +10,15 @@ FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
RUN apt-get update && apt-get install -y build-essential
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rag_retriever/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/

View File

@@ -43,7 +43,7 @@ RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port

View File

@@ -17,6 +17,7 @@ from datetime import datetime
from decimal import Decimal
from typing import Any
import httpx
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
@@ -55,6 +56,9 @@ class ReasonSettings(BaseAppSettings):
max_income: float = 10000000.0 # £10M
max_expenses: float = 10000000.0 # £10M
# External services
coverage_service_url: str = "http://svc-coverage:8000"
# Create app and settings
app, settings = create_app(
@@ -67,6 +71,7 @@ app, settings = create_app(
# Global clients
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
http_client: httpx.AsyncClient | None = None
tracer = get_tracer("svc-reason")
metrics = get_metrics()
@@ -74,7 +79,7 @@ metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, event_bus
global neo4j_client, event_bus, http_client
logger.info("Starting reasoning service")
@@ -89,6 +94,9 @@ async def startup_event() -> None:
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip# pyright: ignore[reportOptionalMemberAccess]
# Initialize HTTP client
http_client = httpx.AsyncClient()
# Subscribe to KG upsert events
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
@@ -98,7 +106,7 @@ async def startup_event() -> None:
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
global neo4j_client, event_bus, http_client
logger.info("Shutting down reasoning service")
@@ -108,6 +116,9 @@ async def shutdown_event() -> None:
if event_bus:
await event_bus.stop()
if http_client:
await http_client.aclose()
logger.info("Reasoning service shutdown complete")
@@ -259,41 +270,76 @@ async def get_calculation_results(
async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
"""Handle KG upsert events for auto-calculation"""
"""Handle KG upsert events for auto-calculation and coverage check"""
data = payload.data
taxpayer_id = data.get("taxpayer_id")
tax_year = data.get("tax_year")
tenant_id = data.get("tenant_id")
if not taxpayer_id or not tax_year or not tenant_id:
logger.warning("Invalid KG upsert event data for coverage check", data=data)
return
# Trigger svc-coverage check
try:
data = payload.data
entities = data.get("entities", [])
tenant_id = data.get("tenant_id")
# Check if we have enough data for calculation
has_income = any(e.get("type") == "IncomeItem" for e in entities)
has_expenses = any(e.get("type") == "ExpenseItem" for e in entities)
if has_income or has_expenses:
if http_client:
coverage_url = f"{settings.coverage_service_url}/v1/coverage/check"
request_body = {
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
}
headers = {
"X-Tenant-ID": tenant_id,
# current_user is not directly available in this event handler, so a
# system user token would need to be generated. X-Authenticated-User is
# omitted here for simplicity; in a real system it should be set securely.
}
response = await http_client.post(coverage_url, json=request_body, headers=headers)
response.raise_for_status()
coverage_report = response.json()
logger.info(
"Auto-triggering calculation due to new financial data",
tenant_id=tenant_id,
"Triggered svc-coverage check",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
coverage_status=coverage_report.get("overall_status"),
)
# Find taxpayer ID from entities
taxpayer_id = None
for entity in entities:
if entity.get("type") == "TaxpayerProfile":
taxpayer_id = entity.get("id")
break
if taxpayer_id:
# If coverage is complete, trigger calculation
if coverage_report.get("overall_status") == "complete":
logger.info(
"Coverage complete, auto-triggering calculation",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
)
await _compute_schedule_async(
tax_year=settings.current_tax_year,
tax_year=tax_year,
taxpayer_id=taxpayer_id,
schedule_id="SA103", # Default to self-employment
tenant_id=tenant_id or "",
tenant_id=tenant_id,
calculation_id=str(ulid.new()),
actor=payload.actor,
)
else:
logger.info(
"Coverage incomplete, not triggering calculation",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
blocking_items=coverage_report.get("blocking_items"),
)
except httpx.HTTPStatusError as e:
logger.error(
"Failed to trigger svc-coverage check due to HTTP error",
taxpayer_id=taxpayer_id,
tax_year=tax_year,
error=str(e),
response_status_code=e.response.status_code,
response_text=e.response.text,
)
except Exception as e:
logger.error("Failed to handle KG upsert for auto-calculation", error=str(e))
logger.error("Failed to handle KG upsert for auto-calculation or coverage check", error=str(e))
async def _compute_schedule_async(
@@ -570,16 +616,107 @@ async def _compute_sa105(
async def _compute_sa100(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA100 (Main return) schedule"""
# This would aggregate from other schedules
# For now, return basic structure
form_boxes = {
"1": {"value": "John Doe", "description": "Your name", "confidence": 0.9}
}
"""Compute SA100 (Main return) schedule by aggregating other schedules"""
form_boxes = {}
evidence_trail: list[dict[str, Any]] = []
taxpayer_id = financial_data.get("taxpayer_id")
tenant_id = financial_data.get("tenant_id") # Assuming tenant_id is passed in financial_data
if not taxpayer_id or not tenant_id:
raise ValueError("Taxpayer ID or Tenant ID missing for SA100 computation")
# Get latest SA103 calculation
sa103_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_CALCULATION]->(c:Calculation)
WHERE c.schedule = 'SA103' AND c.tax_year = $tax_year AND c.retracted_at IS NULL
OPTIONAL MATCH (c)-[:HAS_BOX]->(b:FormBox)
RETURN c.calculation_id AS calculation_id, c.calculated_at AS calculated_at, COLLECT({box: b.box, value: b.value, description: b.description, confidence: b.confidence}) AS form_boxes
ORDER BY c.calculated_at DESC
LIMIT 1
"""
sa103_results = await neo4j_client.run_query( # type: ignore
sa103_query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id, "tax_year": tax_year}
)
sa103_calc = sa103_results[0] if sa103_results else None
sa103_net_profit = Decimal("0")
if sa103_calc and sa103_calc["form_boxes"]:
for box in sa103_calc["form_boxes"]:
if box["box"] == "32": # Net profit box in SA103
sa103_net_profit = Decimal(str(box["value"]))
form_boxes["SA103_32"] = {"value": float(sa103_net_profit), "description": "SA103 Net Profit", "confidence": box.get("confidence", 0.9)}
evidence_trail.append({
"box": "SA103_32",
"source_calculation_id": sa103_calc["calculation_id"],
"description": "Derived from SA103 Net Profit"
})
break
# Get latest SA105 calculation
sa105_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_CALCULATION]->(c:Calculation)
WHERE c.schedule = 'SA105' AND c.tax_year = $tax_year AND c.retracted_at IS NULL
OPTIONAL MATCH (c)-[:HAS_BOX]->(b:FormBox)
RETURN c.calculation_id AS calculation_id, c.calculated_at AS calculated_at, COLLECT({box: b.box, value: b.value, description: b.description, confidence: b.confidence}) AS form_boxes
ORDER BY c.calculated_at DESC
LIMIT 1
"""
sa105_results = await neo4j_client.run_query( # type: ignore
sa105_query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id, "tax_year": tax_year}
)
sa105_calc = sa105_results[0] if sa105_results else None
sa105_net_income = Decimal("0")
if sa105_calc and sa105_calc["form_boxes"]:
for box in sa105_calc["form_boxes"]:
if box["box"] == "net_income": # Net property income box in SA105 (custom box for internal calculation)
sa105_net_income = Decimal(str(box["value"]))
form_boxes["SA105_net_income"] = {"value": float(sa105_net_income), "description": "SA105 Net Property Income", "confidence": box.get("confidence", 0.9)}
evidence_trail.append({
"box": "SA105_net_income",
"source_calculation_id": sa105_calc["calculation_id"],
"description": "Derived from SA105 Net Property Income"
})
break
# Aggregate total income for SA100
total_income = sa103_net_profit + sa105_net_income
form_boxes["SA100_total_income"] = {
"value": float(total_income),
"description": "Total income from all sources",
"confidence": 0.95 # Higher confidence for aggregated value
}
evidence_trail.append({
"box": "SA100_total_income",
"derived_from": ["SA103_32", "SA105_net_income"],
"description": "Aggregated from SA103 net profit and SA105 net property income"
})
# Example: Basic personal allowance (simplified)
personal_allowance = Decimal("12570") # For 2023-24
if total_income > Decimal("100000"): # Tapering not implemented here
personal_allowance = Decimal("0")
form_boxes["SA100_personal_allowance"] = {
"value": float(personal_allowance),
"description": "Personal Allowance",
"confidence": 0.99
}
evidence_trail.append({
"box": "SA100_personal_allowance",
"source": "HMRC_guidance",
"description": f"Standard personal allowance for {tax_year}"
})
# Placeholder for actual SA100 boxes and complex calculations
# This would involve detailed tax band calculations, reliefs, etc.
# For now, we'll just show the aggregation.
form_boxes["1"] = {"value": "John Doe (Aggregated)", "description": "Your name", "confidence": 0.9}
return form_boxes, evidence_trail
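A worked example of the aggregation above, using invented figures and the simplified allowance rule from the code (2023-24 allowance, zeroed above £100,000 with no tapering):

from decimal import Decimal

sa103_net_profit = Decimal("42000")   # illustrative SA103 box 32
sa105_net_income = Decimal("8000")    # illustrative SA105 net property income
total_income = sa103_net_profit + sa105_net_income  # 50000
personal_allowance = Decimal("12570") if total_income <= Decimal("100000") else Decimal("0")
# SA100_total_income -> 50000.0, SA100_personal_allowance -> 12570.0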

View File

@@ -33,3 +33,4 @@ jinja2>=3.1.6
# Statistical calculations
scipy>=1.16.2
httpx