# FILE: apps/svc-kg/main.py
# Knowledge graph facade with CRUD, queries, lineage, and SHACL validation

import json
import os
import re
import sys
from datetime import datetime
from typing import Any

import structlog
from fastapi import Depends, HTTPException, Query, Request
from fastapi.responses import JSONResponse

# Import shared libraries: the repository root must be on sys.path before the
# ``libs`` package can be imported.
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.events import EventBus
from libs.neo import Neo4jClient, SHACLValidator, TemporalQueries
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id

logger = structlog.get_logger()

# Operations that the custom-query endpoint refuses to run. Matching is
# case-insensitive, anchored on word boundaries (so identifiers such as
# ``is_deleted`` or ``recall`` are not rejected) and tolerant of arbitrary
# whitespace between the words of a multi-word keyword (so ``LOAD   CSV`` or
# ``CREATE\nINDEX`` cannot slip through). ``CREATE <type> INDEX`` covers the
# typed index forms (e.g. ``CREATE TEXT INDEX``).
_DANGEROUS_QUERY_PATTERN = re.compile(
    r"\b(?:"
    r"delete|remove|drop"
    r"|create\s+(?:\w+\s+)?index"
    r"|create\s+constraint"
    r"|load\s+csv"
    r"|call|foreach"
    r")\b",
    re.IGNORECASE,
)


class KGSettings(BaseAppSettings):
    """Settings for KG service"""

    service_name: str = "svc-kg"

    # SHACL validation
    shapes_file: str = "schemas/shapes.ttl"
    validate_on_write: bool = True

    # Query limits
    max_results: int = 1000
    max_depth: int = 10
    query_timeout: int = 30


# Create app and settings
app, settings = create_app(
    service_name="svc-kg",
    title="Tax Agent Knowledge Graph Service",
    description="Knowledge graph facade with CRUD and queries",
    settings_class=KGSettings,
)

# Global clients, populated by ``startup_event``
neo4j_client: Neo4jClient | None = None
shacl_validator: SHACLValidator | None = None
event_bus: EventBus | None = None

tracer = get_tracer("svc-kg")
metrics = get_metrics()


@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies.

    Sets up observability, the Neo4j client, the SHACL validator (only when
    ``settings.shapes_file`` exists on disk; otherwise it stays ``None``) and
    starts the event bus.
    """
    global neo4j_client, shacl_validator, event_bus

    logger.info("Starting KG service")

    # Setup observability
    setup_observability(settings)

    # Initialize Neo4j client
    neo4j_driver = create_neo4j_client(settings)
    neo4j_client = Neo4jClient(neo4j_driver)

    # Initialize SHACL validator
    if os.path.exists(settings.shapes_file):
        shacl_validator = SHACLValidator(settings.shapes_file)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    await event_bus.start()

    logger.info("KG service started successfully")


@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies: close the Neo4j client, stop the event bus."""
    logger.info("Shutting down KG service")

    if neo4j_client:
        await neo4j_client.close()

    if event_bus:
        await event_bus.stop()

    logger.info("KG service shutdown complete")


def _require_identifier(value: str, field: str) -> None:
    """Reject values that are not plain identifiers.

    Used for values that end up inside Cypher query text, where they cannot be
    passed as query parameters.

    Args:
        value: The candidate label.
        field: Human-readable name of the field, used in the error detail.

    Raises:
        HTTPException: 400 when ``value`` is not a valid identifier.
    """
    if not value.isidentifier():
        raise HTTPException(
            status_code=400,
            detail=f"Invalid {field}: must be a plain identifier",
        )


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": datetime.utcnow().isoformat(),
    }


@app.post("/nodes/{label}")
async def create_node(
    label: str,
    properties: dict[str, Any],
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Create a new node.

    Args:
        label: Node label taken from the URL path.
        properties: Node properties from the request body; ``tenant_id`` and
            ``created_by`` are overwritten before the node is written.
        current_user: Authenticated user claims; ``sub`` is recorded as creator.
        tenant_id: Tenant the node is stamped with.

    Returns:
        A dict with the status, label, final properties and the client result.

    Raises:
        HTTPException: 500 when any step fails.
    """
    with tracer.start_as_current_span("create_node") as span:
        span.set_attribute("label", label)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Add tenant isolation
            properties["tenant_id"] = tenant_id
            properties["created_by"] = current_user.get("sub", "system")

            # Validate with SHACL if enabled
            # NOTE(review): the boolean result is discarded, so a non-conforming
            # node is logged but still written. Confirm whether validation is
            # meant to be advisory or should reject the request.
            if settings.validate_on_write and shacl_validator:
                await _validate_node(label, properties)

            # Create node
            result = await neo4j_client.create_node(label, properties)

            # Update metrics
            metrics.counter("nodes_created_total").labels(
                tenant_id=tenant_id, label=label
            ).inc()

            logger.info("Node created", label=label, node_id=result.get("id"))

            return {
                "status": "created",
                "label": label,
                "properties": properties,
                "neo4j_result": result,
            }

        except Exception as e:
            logger.error("Failed to create node", label=label, error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Failed to create node: {str(e)}"
            ) from e


@app.get("/nodes/{label}")
async def get_nodes(
    label: str,
    limit: int = Query(default=100, ge=0, le=settings.max_results),
    filters: str | None = Query(default=None),
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Get nodes by label with optional filters.

    Args:
        label: Node label taken from the URL path; must be a plain identifier.
        limit: Maximum number of nodes, between 0 and ``settings.max_results``.
        filters: Optional JSON object of property filters. Any ``tenant_id``
            key supplied by the caller is overwritten with the caller's tenant.
        current_user: Authenticated user claims (required for access).
        tenant_id: Tenant used to scope the query.

    Returns:
        A dict with the label, the number of results and the nodes.

    Raises:
        HTTPException: 400 for an invalid label or invalid ``filters``;
            500 when the query fails.
    """
    with tracer.start_as_current_span("get_nodes") as span:
        span.set_attribute("label", label)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("limit", limit)

        try:
            # The query below is executed without parameters, so the label is
            # part of the query text and must not contain Cypher syntax.
            _require_identifier(label, "label")

            # Parse filters
            filter_dict: dict[str, Any] = {}
            if filters:
                try:
                    parsed_filters = json.loads(filters)
                except json.JSONDecodeError as exc:
                    raise HTTPException(
                        status_code=400, detail="Invalid filters JSON"
                    ) from exc
                # Valid JSON that is not an object (list, string, number...)
                # cannot carry property filters.
                if not isinstance(parsed_filters, dict):
                    raise HTTPException(
                        status_code=400, detail="Invalid filters JSON"
                    )
                filter_dict = parsed_filters

            # Add tenant isolation
            filter_dict["tenant_id"] = tenant_id

            # Build query
            # NOTE(review): filter keys/values are handed to the query builder
            # and no parameters are passed to run_query - confirm the builder
            # escapes them.
            query = TemporalQueries.get_current_state_query(label, filter_dict)
            query += f" LIMIT {limit}"

            # Execute query
            results = await neo4j_client.run_query(query)

            # Update metrics
            metrics.counter("nodes_queried_total").labels(
                tenant_id=tenant_id, label=label
            ).inc()

            return {
                "label": label,
                "count": len(results),
                "nodes": [result["n"] for result in results],
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to get nodes", label=label, error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Failed to get nodes: {str(e)}"
            ) from e


@app.get("/nodes/{label}/{node_id}")
async def get_node(
    label: str,
    node_id: str,
    include_lineage: bool = Query(default=False),
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Get specific node with optional lineage.

    Args:
        label: Node label taken from the URL path; must be a plain identifier.
        node_id: Value of the node's ``id`` property.
        include_lineage: When true, the node's lineage is fetched as well.
        current_user: Authenticated user claims (required for access).
        tenant_id: Tenant the node must belong to.

    Returns:
        ``{"node": ..., "lineage": ...}``; ``lineage`` is ``None`` unless
        ``include_lineage`` is set.

    Raises:
        HTTPException: 400 for an invalid label; 404 when no non-retracted
            node matches; 500 when the query fails.
    """
    with tracer.start_as_current_span("get_node") as span:
        span.set_attribute("label", label)
        span.set_attribute("node_id", node_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Labels cannot be passed as Cypher parameters; the label is
            # interpolated into the query text, so restrict it to an identifier.
            _require_identifier(label, "label")

            # Get node
            query = f"""
            MATCH (n:{label} {{id: $node_id, tenant_id: $tenant_id}})
            WHERE n.retracted_at IS NULL
            RETURN n
            """

            results = await neo4j_client.run_query(
                query, {"node_id": node_id, "tenant_id": tenant_id}
            )

            if not results:
                raise HTTPException(status_code=404, detail="Node not found")

            node_data = results[0]["n"]

            # Get lineage if requested
            lineage: list[dict[str, Any]] = []
            if include_lineage:
                lineage = await neo4j_client.get_node_lineage(node_id)

            return {"node": node_data, "lineage": lineage if include_lineage else None}

        except HTTPException:
            raise
        except Exception as e:
            logger.error(
                "Failed to get node", label=label, node_id=node_id, error=str(e)
            )
            raise HTTPException(
                status_code=500, detail=f"Failed to get node: {str(e)}"
            ) from e


@app.put("/nodes/{label}/{node_id}")
async def update_node(
    label: str,
    node_id: str,
    properties: dict[str, Any],
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Update node with bitemporal versioning.

    Args:
        label: Node label taken from the URL path.
        node_id: Identifier of the node to update.
        properties: New properties from the request body; ``tenant_id`` and
            ``updated_by`` are overwritten before the update is written.
        current_user: Authenticated user claims; ``sub`` is recorded as updater.
        tenant_id: Tenant the node is stamped with.

    Returns:
        A dict with the status, label, node id and final properties.

    Raises:
        HTTPException: 500 when any step fails.
    """
    with tracer.start_as_current_span("update_node") as span:
        span.set_attribute("label", label)
        span.set_attribute("node_id", node_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Add metadata
            properties["tenant_id"] = tenant_id
            properties["updated_by"] = current_user.get("sub", "system")

            # Validate with SHACL if enabled
            # NOTE(review): the boolean result is discarded, so a non-conforming
            # update is logged but still written. Confirm this is intended.
            if settings.validate_on_write and shacl_validator:
                await _validate_node(label, properties)

            # Update node (creates new version)
            await neo4j_client.update_node(label, node_id, properties)

            # Update metrics
            metrics.counter("nodes_updated_total").labels(
                tenant_id=tenant_id, label=label
            ).inc()

            logger.info("Node updated", label=label, node_id=node_id)

            return {
                "status": "updated",
                "label": label,
                "node_id": node_id,
                "properties": properties,
            }

        except Exception as e:
            logger.error(
                "Failed to update node", label=label, node_id=node_id, error=str(e)
            )
            raise HTTPException(
                status_code=500, detail=f"Failed to update node: {str(e)}"
            ) from e


@app.post("/relationships")
async def create_relationship(
    from_label: str,
    from_id: str,
    to_label: str,
    to_id: str,
    relationship_type: str,
    properties: dict[str, Any] | None = None,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Create relationship between nodes.

    Args:
        from_label: Label of the source node.
        from_id: Identifier of the source node.
        to_label: Label of the target node.
        to_id: Identifier of the target node.
        relationship_type: Type of the relationship to create.
        properties: Optional relationship properties; ``tenant_id`` and
            ``created_by`` are overwritten before the relationship is written.
        current_user: Authenticated user claims; ``sub`` is recorded as creator.
        tenant_id: Tenant the relationship is stamped with.

    Returns:
        A dict with the status, endpoints, type and final properties.

    Raises:
        HTTPException: 500 when the relationship cannot be created.
    """
    with tracer.start_as_current_span("create_relationship") as span:
        span.set_attribute("from_label", from_label)
        span.set_attribute("to_label", to_label)
        span.set_attribute("relationship_type", relationship_type)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Add metadata
            rel_properties = properties or {}
            rel_properties["tenant_id"] = tenant_id
            rel_properties["created_by"] = current_user.get("sub", "system")

            # Create relationship
            await neo4j_client.create_relationship(
                from_label, from_id, to_label, to_id, relationship_type, rel_properties
            )

            # Update metrics
            metrics.counter("relationships_created_total").labels(
                tenant_id=tenant_id, relationship_type=relationship_type
            ).inc()

            logger.info(
                "Relationship created",
                from_id=from_id,
                to_id=to_id,
                type=relationship_type,
            )

            return {
                "status": "created",
                "from_id": from_id,
                "to_id": to_id,
                "relationship_type": relationship_type,
                "properties": rel_properties,
            }

        except Exception as e:
            logger.error("Failed to create relationship", error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Failed to create relationship: {str(e)}"
            ) from e


@app.post("/query")
async def execute_query(
    query: str,
    parameters: dict[str, Any] | None = None,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Execute custom Cypher query with tenant isolation.

    Args:
        query: Cypher text supplied by the caller.
        parameters: Optional query parameters; ``tenant_id`` is overwritten
            with the caller's tenant.
        current_user: Authenticated user claims (required for access).
        tenant_id: Tenant made available to the query as ``$tenant_id``.

    Returns:
        A dict echoing the query and parameters, plus the results and count.

    Raises:
        HTTPException: 400 when the query fails the safety check; 500 when
            execution fails.
    """
    with tracer.start_as_current_span("execute_query") as span:
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Add tenant isolation to parameters
            # NOTE(review): this only supplies $tenant_id; nothing here forces
            # the query text to actually filter on it.
            query_params = parameters or {}
            query_params["tenant_id"] = tenant_id

            # Validate query (basic security check)
            if not _is_safe_query(query):
                raise HTTPException(status_code=400, detail="Unsafe query detected")

            # Execute query
            # NOTE(review): settings.query_timeout is not applied here; only
            # the retry count is limited.
            results = await neo4j_client.run_query(query, query_params, max_retries=1)

            # Update metrics
            metrics.counter("custom_queries_total").labels(tenant_id=tenant_id).inc()

            return {
                "query": query,
                "parameters": query_params,
                "results": results,
                "count": len(results),
            }

        except HTTPException:
            # Keep the intended status code (e.g. the 400 above) instead of
            # collapsing it into a 500.
            raise
        except Exception as e:
            logger.error("Query execution failed", query=query[:100], error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Query failed: {str(e)}"
            ) from e


@app.get("/export/rdf")
async def export_rdf(
    format: str = Query(default="turtle"),
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Export knowledge graph as RDF.

    Args:
        format: RDF serialization name passed to the Neo4j client.
        current_user: Authenticated user claims (required for access).
        tenant_id: Tenant recorded on the span and the metric.

    Returns:
        A dict with the format, the export result and the export timestamp.

    Raises:
        HTTPException: 500 when the export fails.
    """
    with tracer.start_as_current_span("export_rdf") as span:
        span.set_attribute("format", format)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # NOTE(review): tenant_id is not passed to export_to_rdf, so the
            # export may not be tenant-scoped - confirm against the client.
            rdf_data = await neo4j_client.export_to_rdf(format)

            # Update metrics
            metrics.counter("rdf_exports_total").labels(
                tenant_id=tenant_id, format=format
            ).inc()

            return {
                "format": format,
                "rdf_data": rdf_data,
                "exported_at": datetime.utcnow().isoformat(),
            }

        except Exception as e:
            logger.error("RDF export failed", format=format, error=str(e))
            raise HTTPException(
                status_code=500, detail=f"RDF export failed: {str(e)}"
            ) from e


@app.post("/validate")
async def validate_graph(
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Validate knowledge graph with SHACL.

    Args:
        current_user: Authenticated user claims (required for access).
        tenant_id: Tenant recorded on the span and the metric.

    Returns:
        A dict with ``conforms``, ``violations_count``, ``results_text`` and
        the validation timestamp.

    Raises:
        HTTPException: 501 when no SHACL validator is configured; 500 when the
            export yields no RDF data or validation fails.
    """
    with tracer.start_as_current_span("validate_graph") as span:
        span.set_attribute("tenant_id", tenant_id)

        try:
            if not shacl_validator:
                raise HTTPException(
                    status_code=501, detail="SHACL validation not configured"
                )

            # Export current graph state
            # NOTE(review): tenant_id is not passed to export_to_rdf - confirm
            # whether the exported graph is tenant-scoped.
            rdf_export = await neo4j_client.export_to_rdf("turtle")

            # Extract RDF data from export result
            rdf_data = rdf_export.get("rdf_data", "")
            if not rdf_data:
                raise HTTPException(
                    status_code=500, detail="Failed to export RDF data for validation"
                )

            # Run SHACL validation
            validation_result = await shacl_validator.validate_graph(rdf_data)

            # Update metrics
            metrics.counter("validations_total").labels(
                tenant_id=tenant_id, conforms=validation_result["conforms"]
            ).inc()

            return {
                "conforms": validation_result["conforms"],
                "violations_count": validation_result["violations_count"],
                "results_text": validation_result["results_text"],
                "validated_at": datetime.utcnow().isoformat(),
            }

        except HTTPException:
            # Keep the intended status code (e.g. the 501 above) instead of
            # collapsing it into a 500.
            raise
        except Exception as e:
            logger.error("Graph validation failed", error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Validation failed: {str(e)}"
            ) from e


def _turtle_literal(value: Any) -> str:
    """Render a property value as the object of a Turtle triple.

    Strings are quoted and escaped, booleans use the lowercase Turtle
    keywords, and every other value is emitted via ``str()`` unchanged.
    """
    if isinstance(value, str):
        # JSON string escapes (\" \\ \n \t \uXXXX ...) are also valid Turtle
        # string escapes; ensure_ascii=False avoids emitting surrogate pairs.
        return json.dumps(value, ensure_ascii=False)
    # bool must be tested explicitly: str(True) is "True", not "true".
    if isinstance(value, bool):
        return "true" if value else "false"
    return f"{value}"


async def _validate_node(label: str, properties: dict[str, Any]) -> bool:
    """Validate node with SHACL.

    Builds a minimal Turtle document describing a single temporary node and
    runs it through the configured SHACL validator.

    Args:
        label: Node label, used as the RDF type of the temporary node.
        properties: Node properties, each emitted as one triple.

    Returns:
        ``False`` only when the validator reports the node as non-conforming.
        ``True`` when it conforms, when no validator is configured, or when
        validation itself raises (deliberately fail-open).
    """
    if not shacl_validator:
        return True

    try:
        # Create a minimal RDF representation of the node for validation
        # NOTE(review): this prefix declaration carries no namespace IRI, which
        # Turtle requires; it looks like the IRI was lost. Confirm the intended
        # namespace - until then parsing is expected to fail and fall through
        # to the fail-open branch below.
        rdf_lines = ["@prefix tax: ."]
        node_uri = "tax:temp_node"

        # Add type declaration
        rdf_lines.append(f"{node_uri} a tax:{label} .")

        # Add properties
        rdf_lines.extend(
            f"{node_uri} tax:{prop} {_turtle_literal(value)} ."
            for prop, value in properties.items()
        )

        rdf_data = "\n".join(rdf_lines)

        # Validate the node RDF data
        validation_result = await shacl_validator.validate_graph(rdf_data)

        if not validation_result["conforms"]:
            logger.warning(
                "Node SHACL validation failed",
                label=label,
                violations=validation_result["violations_count"],
                details=validation_result["results_text"],
            )
            return False

        logger.debug("Node SHACL validation passed", label=label)
        return True

    except Exception as e:
        logger.error("Node SHACL validation error", label=label, error=str(e))
        # Return True to not block operations on validation errors
        return True


def _is_safe_query(query: str) -> bool:
    """Basic query safety check.

    Args:
        query: Cypher text supplied by the caller.

    Returns:
        ``False`` when the query contains any blocked operation (``delete``,
        ``remove``, ``drop``, ``create ... index``, ``create constraint``,
        ``load csv``, ``call``, ``foreach``), ``True`` otherwise. This is a
        keyword screen, not a Cypher parser.
    """
    return _DANGEROUS_QUERY_PATTERN.search(query) is None


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8005, reload=True, log_config=None)