Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

This commit is contained in:
harkon
2025-10-11 08:41:36 +01:00
commit b324ff09ef
276 changed files with 55220 additions and 0 deletions

140
libs/neo/__init__.py Normal file
View File

@@ -0,0 +1,140 @@
from typing import TYPE_CHECKING, Any
import structlog
from .client import Neo4jClient
from .queries import TemporalQueries
from .validator import SHACLValidator
if TYPE_CHECKING:
from libs.schemas.coverage.evaluation import Citation, FoundEvidence
logger = structlog.get_logger()
async def kg_boxes_exist(client: Neo4jClient, box_ids: list[str]) -> dict[str, bool]:
    """Report, for each box id, whether a ``FormBox`` node with that id exists.

    Args:
        client: Client used to run the lookup query.
        box_ids: Form box identifiers to check.

    Returns:
        A mapping ``box_id -> exists``. An empty ``box_ids`` yields ``{}``
        without touching the database. If the query fails, the error is
        logged and every requested id is reported as ``False`` (best effort:
        callers cannot distinguish "missing" from "lookup failed").
    """
    # Nothing to look up: skip the round trip entirely.
    if not box_ids:
        return {}

    query = """
    UNWIND $box_ids AS bid
    OPTIONAL MATCH (fb:FormBox {box_id: bid})
    RETURN bid, fb IS NOT NULL AS exists
    """

    try:
        results = await client.run_query(query, {"box_ids": box_ids})
        return {result["bid"]: result["exists"] for result in results}
    except Exception as e:
        logger.error("Failed to check box existence", box_ids=box_ids, error=str(e))
        return dict.fromkeys(box_ids, False)
async def kg_find_evidence(
    client: Neo4jClient,
    taxpayer_id: str,
    tax_year: str,
    kinds: list[str],
    min_ocr: float = 0.6,
    date_window: int = 30,
) -> list["FoundEvidence"]:
    """Find evidence documents for a taxpayer within a tax year.

    Evidence is selected when it either ``SUPPORTS`` the taxpayer profile or
    is derived from a document that ``BELONGS_TO`` it, and additionally the
    document kind is in ``kinds``, the document date lies inside the tax
    year, and the OCR confidence (missing treated as 0.0) is at least
    ``min_ocr``. At most 100 rows are returned, highest OCR confidence first.

    Args:
        client: Client used to run the query.
        taxpayer_id: Value matched against ``TaxpayerProfile.taxpayer_id``.
        tax_year: Value matched against ``TaxYear.label``.
        kinds: Accepted document kinds; an empty list yields ``[]``.
        min_ocr: Minimum OCR confidence.
        date_window: Accepted for interface compatibility; the query does not
            currently use it.

    Returns:
        A list of ``FoundEvidence``; ``[]`` if nothing matches or if the
        lookup fails (the failure is logged).
    """
    # `d.kind IN []` can never match, so avoid the round trip.
    if not kinds:
        return []

    # The OR branch is parenthesized on purpose: in Cypher AND binds tighter
    # than OR, so without the parentheses the kind/date/OCR filters would
    # only constrain the BELONGS_TO alternative.
    query = """
    MATCH (p:TaxpayerProfile {taxpayer_id: $tid})-[:OF_TAX_YEAR]->(y:TaxYear {label: $tax_year})
    MATCH (ev:Evidence)-[:DERIVED_FROM]->(d:Document)
    WHERE ((ev)-[:SUPPORTS]->(p) OR (d)-[:BELONGS_TO]->(p))
      AND d.kind IN $kinds
      AND date(d.date) >= date(y.start_date) AND date(d.date) <= date(y.end_date)
      AND coalesce(ev.ocr_confidence, 0.0) >= $min_ocr
    RETURN d.doc_id AS doc_id,
           d.kind AS kind,
           ev.page AS page,
           ev.bbox AS bbox,
           ev.ocr_confidence AS ocr_confidence,
           ev.extract_confidence AS extract_confidence,
           d.date AS date
    ORDER BY coalesce(ev.ocr_confidence, 0.0) DESC
    LIMIT 100
    """

    try:
        results = await client.run_query(
            query,
            {
                "tid": taxpayer_id,
                "tax_year": tax_year,
                "kinds": kinds,
                "min_ocr": min_ocr,
            },
        )

        # Imported lazily: the schema module is only imported under
        # TYPE_CHECKING at module level.
        from libs.schemas.coverage.evaluation import FoundEvidence

        return [
            FoundEvidence(
                doc_id=result["doc_id"],
                kind=result["kind"],
                # `is not None` keeps a legitimate page index of 0.
                pages=[result["page"]] if result["page"] is not None else [],
                bbox=result["bbox"],
                ocr_confidence=result["ocr_confidence"] or 0.0,
                extract_confidence=result["extract_confidence"] or 0.0,
                date=result["date"],
            )
            for result in results
        ]

    except Exception as e:
        logger.error(
            "Failed to find evidence",
            taxpayer_id=taxpayer_id,
            tax_year=tax_year,
            kinds=kinds,
            error=str(e),
        )
        return []
async def kg_rule_citations(
    client: Neo4jClient, schedule_id: str, box_ids: list[str]
) -> list["Citation"]:
    """Get rule citations for the rules governing the given form boxes.

    Args:
        client: Client used to run the query.
        schedule_id: Only attached to the error log; the query itself does
            not filter on it.
        box_ids: Form box identifiers whose governing rules are looked up;
            an empty list yields ``[]``.

    Returns:
        Up to 10 ``Citation`` objects ordered by rule id, then document id,
        then locator; ``[]`` if nothing matches or if the lookup fails (the
        failure is logged).
    """
    # `fb.box_id IN []` can never match, so avoid the round trip.
    if not box_ids:
        return []

    # ORDER BY makes the LIMIT deterministic; without it the database is free
    # to return any 10 matching rows.
    query = """
    MATCH (fb:FormBox)-[:GOVERNED_BY]->(r:Rule)-[:CITES]->(doc:Document)
    WHERE fb.box_id IN $box_ids
    RETURN r.rule_id AS rule_id,
           doc.doc_id AS doc_id,
           doc.locator AS locator
    ORDER BY rule_id, doc_id, locator
    LIMIT 10
    """

    try:
        results = await client.run_query(query, {"box_ids": box_ids})

        # Imported lazily: the schema module is only imported under
        # TYPE_CHECKING at module level.
        from libs.schemas.coverage.evaluation import Citation

        return [
            Citation(
                rule_id=result["rule_id"],
                doc_id=result["doc_id"],
                locator=result["locator"],
            )
            for result in results
        ]

    except Exception as e:
        logger.error(
            "Failed to get rule citations",
            schedule_id=schedule_id,
            box_ids=box_ids,
            error=str(e),
        )
        return []
# Public API of the package: the re-exported helpers plus the knowledge-graph
# lookup coroutines defined in this module.
__all__ = [
    "Neo4jClient",
    "SHACLValidator",
    "TemporalQueries",
    "kg_boxes_exist",
    "kg_find_evidence",
    "kg_rule_citations",
]

350
libs/neo/client.py Normal file
View File

@@ -0,0 +1,350 @@
"""Neo4j session helpers, Cypher runner with retry, SHACL validator invoker."""
import asyncio
from datetime import datetime, timezone
from typing import Any

import structlog
from neo4j import Transaction
from neo4j.exceptions import ServiceUnavailable, TransientError
logger = structlog.get_logger()
class Neo4jClient:
    """Async wrapper around a synchronous Neo4j driver.

    Blocking driver calls run on the event loop's default executor.
    ``TransientError`` and ``ServiceUnavailable`` are retried with
    exponential backoff; any other exception is logged and re-raised.

    Labels, relationship types and property keys cannot be sent as Cypher
    parameters, so they are interpolated into the query text. They are
    validated first; a name that is not a plain identifier raises
    ``ValueError`` instead of silently changing the query.
    """

    def __init__(self, driver: Any) -> None:
        self.driver = driver

    async def __aenter__(self) -> "Neo4jClient":
        """Async context manager entry"""
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Async context manager exit"""
        await self.close()

    async def close(self) -> None:
        """Close the driver"""
        await asyncio.get_running_loop().run_in_executor(None, self.driver.close)

    @staticmethod
    def _require_identifier(
        value: Any, what: str, *, allow_multiple: bool = False
    ) -> str:
        """Return ``value`` if it is safe to interpolate into Cypher.

        With ``allow_multiple`` a colon-separated list of identifiers
        (``"A:B"``, as used for multi-label patterns) is accepted.

        Raises:
            ValueError: ``value`` is not a string of identifier characters
                (this includes ``None`` and the empty string).
        """
        if allow_multiple and isinstance(value, str):
            parts: list[Any] = value.split(":")
        else:
            parts = [value]
        if not all(isinstance(part, str) and part.isidentifier() for part in parts):
            raise ValueError(f"Invalid Cypher {what}: {value!r}")
        return value

    @staticmethod
    def _turtle_literal(value: Any) -> str:
        """Render a property value as a Turtle literal.

        Booleans become ``true``/``false``, other numbers are written as-is,
        and everything else is written as an escaped double-quoted string.
        """
        # bool is a subclass of int, so it must be tested first.
        if isinstance(value, bool):
            return "true" if value else "false"
        if isinstance(value, (int, float)):
            return str(value)
        escaped = (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
            .replace("\t", "\\t")
        )
        return f'"{escaped}"'

    async def _execute_with_retry(
        self, work: Any, *, what: str, max_retries: int, **log_context: Any
    ) -> Any:
        """Run blocking ``work`` in the executor, retrying transient failures.

        ``work`` is always attempted at least once, even if ``max_retries``
        is zero or negative. Between attempts the coroutine sleeps 1, 2, 4, ...
        seconds. ``what`` ("Query"/"Transaction") prefixes the log messages
        and ``log_context`` is attached to every log entry.
        """
        attempts = max(1, max_retries)
        loop = asyncio.get_running_loop()

        for attempt in range(attempts):
            try:
                return await loop.run_in_executor(None, work)
            except (TransientError, ServiceUnavailable) as e:
                if attempt == attempts - 1:
                    logger.error(
                        f"{what} failed after retries",
                        attempt=attempt + 1,
                        error=str(e),
                        **log_context,
                    )
                    raise

                wait_time = 2**attempt  # Exponential backoff
                logger.warning(
                    f"{what} failed, retrying",
                    attempt=attempt + 1,
                    wait_time=wait_time,
                    error=str(e),
                    **log_context,
                )
                await asyncio.sleep(wait_time)
            except Exception as e:
                logger.error(
                    f"{what} failed with non-retryable error",
                    error=str(e),
                    **log_context,
                )
                raise

        # Every iteration either returns or raises; kept for type checkers.
        raise RuntimeError("retry loop exited without a result")

    async def run_query(
        self,
        query: str,
        parameters: dict[str, Any] | None = None,
        database: str = "neo4j",
        max_retries: int = 3,
    ) -> list[dict[str, Any]]:
        """Run a Cypher query and return each record as a dict.

        Records are converted with ``record.data()``. Transient failures are
        retried up to ``max_retries`` attempts (at least one attempt is
        always made); the last error is re-raised.
        """

        def _run_query() -> list[dict[str, Any]]:
            with self.driver.session(database=database) as session:
                result = session.run(query, parameters or {})
                return [record.data() for record in result]

        return await self._execute_with_retry(
            _run_query, what="Query", max_retries=max_retries, query=query[:100]
        )

    async def run_transaction(
        self, transaction_func: Any, database: str = "neo4j", max_retries: int = 3
    ) -> Any:
        """Run ``transaction_func`` via ``session.execute_write`` with retry.

        Returns whatever ``transaction_func`` returns. At least one attempt
        is always made; the last error is re-raised.
        """

        def _run_transaction() -> Any:
            with self.driver.session(database=database) as session:
                return session.execute_write(transaction_func)

        return await self._execute_with_retry(
            _run_transaction, what="Transaction", max_retries=max_retries
        )

    async def create_node(
        self, label: str, properties: dict[str, Any], database: str = "neo4j"
    ) -> dict[str, Any]:
        """Create a node, stamping ``asserted_at`` if the caller did not.

        The caller's ``properties`` dict is not modified.

        Returns:
            The created node's ``id`` property when it has one, otherwise
            the node's property dict (``{}`` if the query returned no rows).

        Raises:
            ValueError: ``label`` is not a valid identifier.
        """
        label = self._require_identifier(label, "label", allow_multiple=True)

        # Work on a copy so the caller's dict is left untouched.
        node_properties = dict(properties)
        if "asserted_at" not in node_properties:
            node_properties["asserted_at"] = datetime.now(timezone.utc)

        query = f"""
        CREATE (n:{label} $properties)
        RETURN n
        """

        result = await self.run_query(query, {"properties": node_properties}, database)
        node = result[0]["n"] if result else {}
        # Return node ID if available, otherwise return the full node
        return node.get("id", node) if isinstance(node, dict) else node

    async def update_node(
        self,
        label: str,
        node_id: str,
        properties: dict[str, Any],
        database: str = "neo4j",
    ) -> dict[str, Any]:
        """Update a node with bitemporal versioning.

        In one write transaction the current (non-retracted) version is
        marked retracted and a new node carrying ``properties`` plus ``id``
        and ``asserted_at`` is created. Both timestamps are the same
        timezone-aware instant.

        Returns:
            The new node's properties, or ``{}`` if none were returned.

        Raises:
            ValueError: ``label`` is not a valid identifier.
        """
        label = self._require_identifier(label, "label", allow_multiple=True)

        # One timestamp for both halves so the old version ends exactly when
        # the new one begins.
        now = datetime.now(timezone.utc)
        new_properties = {**properties, "id": node_id, "asserted_at": now}

        def _update_transaction(tx: Transaction) -> Any:
            # First, retract the current version
            retract_query = f"""
            MATCH (n:{label} {{id: $node_id}})
            WHERE n.retracted_at IS NULL
            SET n.retracted_at = $now
            RETURN n
            """
            tx.run(retract_query, {"node_id": node_id, "now": now})  # fmt: skip # pyright: ignore[reportArgumentType]

            # Create new version
            create_query = f"""
            CREATE (n:{label} $properties)
            RETURN n
            """
            result = tx.run(create_query, {"properties": new_properties})  # fmt: skip # pyright: ignore[reportArgumentType]
            record = result.single()
            # data() converts the Node into a plain property dict, matching
            # what run_query returns.
            return record.data()["n"] if record else None

        result = await self.run_transaction(_update_transaction, database)
        return result if isinstance(result, dict) else {}

    async def create_relationship(  # pylint: disable=too-many-arguments,too-many-positional-arguments
        self,
        from_label: str | None = None,
        from_id: str | None = None,
        to_label: str | None = None,
        to_id: str | None = None,
        relationship_type: str | None = None,
        properties: dict[str, Any] | None = None,
        database: str = "neo4j",
        # Alternative signature for tests
        from_node_id: int | None = None,
        to_node_id: int | None = None,
    ) -> dict[str, Any]:
        """Create a relationship between two nodes.

        Endpoints are addressed either by internal node ids
        (``from_node_id``/``to_node_id``, used when both are given) or by
        label plus ``id`` property, in which case retracted nodes are
        excluded. ``asserted_at`` is stamped if absent; the caller's
        ``properties`` dict is not modified.

        Returns:
            The relationship's ``id`` entry when the result is a dict
            containing one, otherwise the relationship value as returned by
            the query (``{}`` if no rows came back).

        Raises:
            ValueError: ``relationship_type`` is missing or invalid, or a
                label is missing or invalid in the label-based form.
        """
        rel_type = self._require_identifier(relationship_type, "relationship type")

        rel_properties = dict(properties or {})
        if "asserted_at" not in rel_properties:
            rel_properties["asserted_at"] = datetime.now(timezone.utc)

        if from_node_id is not None and to_node_id is not None:
            # Alternative signature for tests (using node IDs)
            query = f"""
            MATCH (from) WHERE id(from) = $from_id
            MATCH (to) WHERE id(to) = $to_id
            CREATE (from)-[r:{rel_type} $properties]->(to)
            RETURN r
            """
            params: dict[str, Any] = {
                "from_id": from_node_id,
                "to_id": to_node_id,
                "properties": rel_properties,
            }
        else:
            # Original signature (using labels and IDs)
            source_label = self._require_identifier(
                from_label, "label", allow_multiple=True
            )
            target_label = self._require_identifier(
                to_label, "label", allow_multiple=True
            )
            query = f"""
            MATCH (from:{source_label} {{id: $from_id}})
            MATCH (to:{target_label} {{id: $to_id}})
            WHERE from.retracted_at IS NULL AND to.retracted_at IS NULL
            CREATE (from)-[r:{rel_type} $properties]->(to)
            RETURN r
            """
            params = {
                "from_id": from_id,
                "to_id": to_id,
                "properties": rel_properties,
            }

        result = await self.run_query(query, params, database)
        rel = result[0]["r"] if result else {}
        # record.data() may hand back a relationship as a tuple rather than a
        # dict, so only look for "id" when it really is a dict.
        return rel.get("id", rel) if isinstance(rel, dict) else rel

    async def get_node_lineage(
        self, node_id: str, max_depth: int = 10, database: str = "neo4j"
    ) -> list[dict[str, Any]]:
        """Get the ``DERIVED_FROM`` lineage from a node to its evidence.

        Follows between 1 and ``max_depth`` ``DERIVED_FROM`` hops from the
        non-retracted node with the given ``id`` to ``Evidence`` nodes and
        returns up to 100 paths, longest first.

        Raises:
            ValueError: ``max_depth`` is not a positive integer.
        """
        if (
            isinstance(max_depth, bool)
            or not isinstance(max_depth, int)
            or max_depth < 1
        ):
            raise ValueError(f"max_depth must be a positive integer: {max_depth!r}")

        # The hop bound is interpolated (after validation) rather than
        # passed as a parameter.
        query = f"""
        MATCH path = (n {{id: $node_id}})-[:DERIVED_FROM*1..{max_depth}]->(evidence:Evidence)
        WHERE n.retracted_at IS NULL
        RETURN path, evidence
        ORDER BY length(path) DESC
        LIMIT 100
        """

        return await self.run_query(query, {"node_id": node_id}, database)

    async def export_to_rdf(  # pylint: disable=redefined-builtin
        self,
        format: str = "turtle",
        database: str = "neo4j",
    ) -> dict[str, Any]:
        """Export non-retracted graph data to RDF.

        Tries the ``n10s.rdf.export.cypher`` procedure first and returns its
        first row. If that fails for any reason, a warning is logged and a
        simple Turtle dump is built instead, returned as
        ``{"rdf_data": <str>, "format": format}``.
        """
        query = """
        CALL n10s.rdf.export.cypher(
            'MATCH (n) WHERE n.retracted_at IS NULL RETURN n',
            $format,
            {}
        ) YIELD triplesCount, format
        RETURN triplesCount, format
        """

        try:
            result = await self.run_query(query, {"format": format}, database)
            return result[0] if result else {}
        except Exception as e:  # pylint: disable=broad-exception-caught
            logger.warning("RDF export failed, using fallback", error=str(e))
            fallback_result = await self._export_rdf_fallback(database)
            return {"rdf_data": fallback_result, "format": format}

    async def _export_rdf_fallback(self, database: str = "neo4j") -> str:
        """Fallback RDF export without n10s plugin.

        Emits one ``a`` triple per node label, one triple per node property
        (values rendered by ``_turtle_literal``) and one triple per
        relationship between non-retracted nodes. Relationship properties
        are fetched but not exported.
        """
        nodes_query = """
        MATCH (n) WHERE n.retracted_at IS NULL
        RETURN labels(n) as labels, properties(n) as props, id(n) as neo_id
        """

        rels_query = """
        MATCH (a)-[r]->(b)
        WHERE a.retracted_at IS NULL AND b.retracted_at IS NULL
        RETURN type(r) as type, properties(r) as props,
               id(a) as from_id, id(b) as to_id
        """

        nodes = await self.run_query(nodes_query, database=database)
        relationships = await self.run_query(rels_query, database=database)

        # Convert to simple Turtle format
        rdf_lines = ["@prefix tax: <https://tax-kg.example.com/> ."]

        for node in nodes:
            node_uri = f"tax:node_{node['neo_id']}"
            rdf_lines.extend(f"{node_uri} a tax:{label} ." for label in node["labels"])
            rdf_lines.extend(
                f"{node_uri} tax:{prop} {self._turtle_literal(value)} ."
                for prop, value in node["props"].items()
            )

        rdf_lines.extend(
            f"tax:node_{rel['from_id']} tax:{rel['type']} tax:node_{rel['to_id']} ."
            for rel in relationships
        )

        return "\n".join(rdf_lines)

    async def find_nodes(
        self, label: str, properties: dict[str, Any], database: str = "neo4j"
    ) -> list[dict[str, Any]]:
        """Find nodes with the given label whose properties all match.

        Retracted nodes are not filtered out here.

        Raises:
            ValueError: ``label`` or a property key is not a valid identifier.
        """
        label = self._require_identifier(label, "label", allow_multiple=True)
        where_clause, params = self._build_properties_clause(properties)
        query = f"MATCH (n:{label}) WHERE {where_clause} RETURN n"

        result = await self.run_query(query, params, database)
        return [record["n"] for record in result]

    async def execute_query(
        self,
        query: str,
        parameters: dict[str, Any] | None = None,
        database: str = "neo4j",
    ) -> list[dict[str, Any]]:
        """Execute a custom Cypher query (alias of ``run_query``)."""
        return await self.run_query(query, parameters, database)

    def _build_properties_clause(
        self, properties: dict[str, Any]
    ) -> tuple[str, dict[str, Any]]:
        """Build a WHERE clause and its parameters for property equality.

        Each key becomes ``n.<key> = $prop_<i>`` with the value bound as a
        parameter; an empty mapping yields ``("true", {})``.

        Raises:
            ValueError: a key is not a valid identifier.
        """
        if not properties:
            return "true", {}

        clauses = []
        params = {}

        for i, (key, value) in enumerate(properties.items()):
            param_name = f"prop_{i}"
            safe_key = self._require_identifier(key, "property key")
            clauses.append(f"n.{safe_key} = ${param_name}")
            params[param_name] = value

        return " AND ".join(clauses), params

78
libs/neo/queries.py Normal file
View File

@@ -0,0 +1,78 @@
"""Neo4j Cypher queries for coverage policy system"""
from datetime import datetime
from typing import Any
import structlog
logger = structlog.get_logger()
class TemporalQueries:
    """Builders for Cypher query strings over bitemporal nodes.

    The methods return complete query text with values inlined, so every
    inlined value is escaped and every label/property key is validated
    before it is placed into the query.
    """

    @staticmethod
    def _identifier(value: Any, what: str, *, allow_multiple: bool = False) -> str:
        """Return ``value`` if it is a plain identifier, else raise ValueError.

        With ``allow_multiple`` a colon-separated list (``"A:B"``) is accepted.
        """
        if allow_multiple and isinstance(value, str):
            parts: list[Any] = value.split(":")
        else:
            parts = [value]
        if not all(isinstance(part, str) and part.isidentifier() for part in parts):
            raise ValueError(f"Invalid Cypher {what}: {value!r}")
        return value

    @staticmethod
    def _string_literal(value: str) -> str:
        """Render ``value`` as a single-quoted Cypher string literal."""
        # Backslashes first, otherwise the escapes added for quotes would be
        # escaped again.
        escaped = value.replace("\\", "\\\\").replace("'", "\\'")
        return f"'{escaped}'"

    @staticmethod
    def _filter_conditions(filters: dict[str, Any] | None) -> list[str]:
        """Turn ``filters`` into ``n.<key> ...`` equality conditions.

        Strings are quoted and escaped, ``None`` becomes ``IS NULL`` and any
        other value is inlined using its ``str()`` form.
        """
        conditions = []
        for key, value in (filters or {}).items():
            prop = f"n.{TemporalQueries._identifier(key, 'property key')}"
            if value is None:
                # `= null` never matches in Cypher; IS NULL is the real test.
                conditions.append(f"{prop} IS NULL")
            elif isinstance(value, str):
                conditions.append(f"{prop} = {TemporalQueries._string_literal(value)}")
            else:
                conditions.append(f"{prop} = {value}")
        return conditions

    @staticmethod
    def get_current_state_query(
        label: str, filters: dict[str, Any] | None = None
    ) -> str:
        """Build a query for the current (non-retracted) nodes of ``label``.

        Results are ordered by ``asserted_at``, newest first.

        Raises:
            ValueError: ``label`` or a filter key is not a valid identifier.
        """
        label = TemporalQueries._identifier(label, "label", allow_multiple=True)
        where_clause = " AND ".join(
            ["n.retracted_at IS NULL", *TemporalQueries._filter_conditions(filters)]
        )

        return f"""
        MATCH (n:{label})
        WHERE {where_clause}
        RETURN n
        ORDER BY n.asserted_at DESC
        """

    @staticmethod
    def get_historical_state_query(
        label: str, as_of_time: datetime, filters: dict[str, Any] | None = None
    ) -> str:
        """Build a query for the nodes of ``label`` valid at ``as_of_time``.

        A node qualifies when it was asserted at or before ``as_of_time`` and
        was either never retracted or retracted after it. Results are ordered
        by ``asserted_at``, newest first.

        Raises:
            ValueError: ``label`` or a filter key is not a valid identifier.
        """
        label = TemporalQueries._identifier(label, "label", allow_multiple=True)
        as_of = TemporalQueries._string_literal(as_of_time.isoformat())
        where_clause = " AND ".join(
            [
                f"n.asserted_at <= datetime({as_of})",
                f"(n.retracted_at IS NULL OR n.retracted_at > datetime({as_of}))",
                *TemporalQueries._filter_conditions(filters),
            ]
        )

        return f"""
        MATCH (n:{label})
        WHERE {where_clause}
        RETURN n
        ORDER BY n.asserted_at DESC
        """

    @staticmethod
    def get_audit_trail_query(node_id: str) -> str:
        """Build a query listing every version of the node with ``node_id``.

        Each row carries the version's assertion/retraction timestamps,
        source, extractor version and full property map, oldest first.
        """
        node_literal = TemporalQueries._string_literal(node_id)
        return f"""
        MATCH (n {{id: {node_literal}}})
        RETURN n.asserted_at as asserted_at,
               n.retracted_at as retracted_at,
               n.source as source,
               n.extractor_version as extractor_version,
               properties(n) as properties
        ORDER BY n.asserted_at ASC
        """

70
libs/neo/validator.py Normal file
View File

@@ -0,0 +1,70 @@
"""SHACL validation using pySHACL"""
import asyncio
from typing import Any
import structlog
logger = structlog.get_logger()
# pyright: ignore[reportAttributeAccessIssue]
class SHACLValidator:  # pylint: disable=too-few-public-methods
    """Validate RDF data against a SHACL shapes file using pySHACL."""

    def __init__(self, shapes_file: str) -> None:
        self.shapes_file = shapes_file

    async def validate_graph(self, rdf_data: str) -> dict[str, Any]:
        """Validate Turtle ``rdf_data`` against the configured shapes file.

        The blocking validation runs on the event loop's default executor.

        Returns:
            A dict with ``conforms`` (bool), ``results_text`` (str) and
            ``violations_count`` (int):

            * normal run: pySHACL's verdict and report text, with
              ``violations_count`` the number of results in the report whose
              severity is ``sh:Violation``;
            * pySHACL/rdflib not importable: validation is skipped and
              reported as conforming with a count of 0;
            * any other failure: ``conforms`` is ``False`` and the count
              is ``-1``.
        """

        def _validate() -> dict[str, Any]:
            # Only the imports are guarded by ImportError, so an ImportError
            # raised while validating is reported as a failure, not a skip.
            try:
                # pylint: disable=import-outside-toplevel
                from pyshacl import validate
                from rdflib import Graph, URIRef
            except ImportError:
                logger.warning("pySHACL not available, skipping validation")
                return {
                    "conforms": True,
                    "results_text": "SHACL validation skipped (pySHACL not installed)",
                    "violations_count": 0,
                }

            try:
                # Load data graph
                data_graph = Graph()
                data_graph.parse(data=rdf_data, format="turtle")

                # Load shapes graph
                shapes_graph = Graph()
                shapes_graph.parse(self.shapes_file, format="turtle")

                # Run validation
                conforms, results_graph, results_text = validate(
                    data_graph=data_graph,
                    shacl_graph=shapes_graph,
                    inference="rdfs",
                    abort_on_first=False,
                    allow_infos=True,
                    allow_warnings=True,
                )

                # Count report entries with severity sh:Violation. Counting
                # every subject in the report graph would count every triple,
                # and info/warning results do not affect `conforms` here.
                shacl_ns = "http://www.w3.org/ns/shacl#"
                violations = set(
                    results_graph.subjects(  # pyright: ignore[reportAttributeAccessIssue]
                        URIRef(shacl_ns + "resultSeverity"),
                        URIRef(shacl_ns + "Violation"),
                    )
                )

                return {
                    "conforms": conforms,
                    "results_text": results_text,
                    "violations_count": len(violations),
                }

            except Exception as e:  # pylint: disable=broad-exception-caught
                logger.error("SHACL validation failed", error=str(e))
                return {
                    "conforms": False,
                    "results_text": f"Validation error: {str(e)}",
                    "violations_count": -1,
                }

        return await asyncio.get_running_loop().run_in_executor(None, _validate)