Initial commit
Some checks failed: every CI/CD Pipeline job for this push was cancelled, including Code Quality & Linting, Policy Validation, Test Suite, Build Docker Images (svc-coverage, svc-extract, svc-firm-connectors, svc-forms, svc-hmrc, svc-ingestion, svc-kg, svc-normalize-map, svc-ocr, svc-rag-indexer, svc-rag-retriever, svc-reason, svc-rpa, ui-review), Security Scanning (svc-coverage, svc-extract, svc-kg, svc-rag-retriever, ui-review), Generate SBOM, Deploy to Staging, Deploy to Production, and Notifications.

Commit b324ff09ef by harkon, 2025-10-11 08:41:36 +01:00. 276 changed files with 55,220 additions and 0 deletions.

apps/__init__.py (4 lines added)

@@ -0,0 +1,4 @@
# file: /Users/harris/Projects/ai-tax-agent/apps/__init__.py
# hypothesis_version: 6.138.15
[]


@@ -0,0 +1,53 @@
# Multi-stage build for svc-coverage
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_coverage/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_coverage/ ./apps/svc_coverage/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_coverage.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -0,0 +1 @@
"""Coverage service package."""


@@ -0,0 +1,112 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = alembic
# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .
# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python-dateutil library that can be
# installed by adding `alembic[tz]` to the pip requirements
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version number format
version_num_format = %04d
# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses
# os.pathsep. If this key is omitted entirely, it falls back to the legacy
# behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
version_path_separator = os
# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = postgresql://user:pass@localhost:5432/coverage
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME
# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S


@@ -0,0 +1,92 @@
"""Alembic environment configuration for coverage service."""
import os
import sys
from logging.config import fileConfig
from alembic import context
from sqlalchemy import engine_from_config, pool
# Add the parent directory to the path so we can import our models
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
# Import your models here
from apps.svc_coverage.models import Base
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def get_url():
"""Get database URL from environment or config."""
return os.getenv("DATABASE_URL", config.get_main_option("sqlalchemy.url"))
def run_migrations_offline() -> None:
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = get_url()
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online() -> None:
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
configuration = config.get_section(config.config_ini_section)
configuration["sqlalchemy.url"] = get_url()
connectable = engine_from_config(
configuration,
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
context.configure(
connection=connection, target_metadata=target_metadata
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()
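
The env.py above resolves the connection string from the DATABASE_URL environment variable and only falls back to the sqlalchemy.url in alembic.ini. A minimal sketch of driving the same migrations from Python rather than the alembic CLI, assuming alembic.ini and the alembic/ script directory sit in the service's working directory (the helper name and paths here are illustrative):

import os
from alembic import command
from alembic.config import Config

def upgrade_coverage_db(ini_path: str = "alembic.ini") -> None:
    # Point env.py's get_url() at the target database; alembic.ini's value is used otherwise
    os.environ.setdefault("DATABASE_URL", "postgresql://user:pass@localhost:5432/coverage")
    command.upgrade(Config(ini_path), "head")

if __name__ == "__main__":
    upgrade_coverage_db()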


@@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade() -> None:
${upgrades if upgrades else "pass"}
def downgrade() -> None:
${downgrades if downgrades else "pass"}


@@ -0,0 +1,76 @@
"""Initial coverage tables
Revision ID: 0001
Revises:
Create Date: 2024-09-14 12:00:00.000000
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '0001'
down_revision = None
branch_labels = None
depends_on = None
def upgrade() -> None:
# Create coverage_versions table
op.create_table(
'coverage_versions',
sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
sa.Column('version', sa.String(length=50), nullable=False),
sa.Column('jurisdiction', sa.String(length=10), nullable=False),
sa.Column('tax_year', sa.String(length=10), nullable=False),
sa.Column('tenant_id', sa.String(length=100), nullable=True),
sa.Column('source_files', postgresql.JSON(astext_type=sa.Text()), nullable=False),
sa.Column('compiled_at', sa.DateTime(), nullable=False),
sa.Column('hash', sa.String(length=64), nullable=False),
sa.PrimaryKeyConstraint('id')
)
# Create indexes for coverage_versions
op.create_index('ix_coverage_versions_version', 'coverage_versions', ['version'])
op.create_index('ix_coverage_versions_jurisdiction_tax_year', 'coverage_versions', ['jurisdiction', 'tax_year'])
op.create_index('ix_coverage_versions_tenant_id', 'coverage_versions', ['tenant_id'])
op.create_index('ix_coverage_versions_hash', 'coverage_versions', ['hash'])
# Create coverage_audit table
op.create_table(
'coverage_audit',
sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
sa.Column('taxpayer_id', sa.String(length=100), nullable=False),
sa.Column('tax_year', sa.String(length=10), nullable=False),
sa.Column('policy_version', sa.String(length=50), nullable=False),
sa.Column('overall_status', sa.String(length=20), nullable=False),
sa.Column('blocking_items', postgresql.JSON(astext_type=sa.Text()), nullable=False),
sa.Column('created_at', sa.DateTime(), nullable=False),
sa.Column('trace_id', sa.String(length=100), nullable=True),
sa.PrimaryKeyConstraint('id')
)
# Create indexes for coverage_audit
op.create_index('ix_coverage_audit_taxpayer_id', 'coverage_audit', ['taxpayer_id'])
op.create_index('ix_coverage_audit_tax_year', 'coverage_audit', ['tax_year'])
op.create_index('ix_coverage_audit_taxpayer_tax_year', 'coverage_audit', ['taxpayer_id', 'tax_year'])
op.create_index('ix_coverage_audit_created_at', 'coverage_audit', ['created_at'])
op.create_index('ix_coverage_audit_trace_id', 'coverage_audit', ['trace_id'])
def downgrade() -> None:
# Drop coverage_audit table and indexes
op.drop_index('ix_coverage_audit_trace_id', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_created_at', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_taxpayer_tax_year', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_tax_year', table_name='coverage_audit')
op.drop_index('ix_coverage_audit_taxpayer_id', table_name='coverage_audit')
op.drop_table('coverage_audit')
# Drop coverage_versions table and indexes
op.drop_index('ix_coverage_versions_hash', table_name='coverage_versions')
op.drop_index('ix_coverage_versions_tenant_id', table_name='coverage_versions')
op.drop_index('ix_coverage_versions_jurisdiction_tax_year', table_name='coverage_versions')
op.drop_index('ix_coverage_versions_version', table_name='coverage_versions')
op.drop_table('coverage_versions')

apps/svc_coverage/main.py (523 lines added)

@@ -0,0 +1,523 @@
# FILE: apps/svc_coverage/main.py
# Coverage policy service with evaluation, clarification, and hot reload
import os
import sys
from typing import Any
import structlog
from fastapi import Depends, HTTPException
from pydantic import BaseModel
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.coverage import CoverageEvaluator
from libs.events import EventBus
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.policy import PolicyLoader, get_policy_loader
from libs.schemas import (
ClarifyContext,
ClarifyResponse,
CoverageGap,
CoverageReport,
PolicyError,
UploadOption,
ValidationResult,
)
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
async def http_exception_handler(_request, exc) -> dict[str, str | int]:
"""Handle HTTP exceptions"""
return {"detail": exc.detail, "status_code": exc.status_code}
class CoverageSettings(BaseAppSettings):
"""Settings for Coverage service"""
service_name: str = "svc-coverage"
# Policy configuration
config_dir: str = "config"
policy_reload_enabled: bool = True
# Database
postgres_url: str = "postgresql://user:pass@localhost:5432/coverage"
# External services
rag_service_url: str = "http://svc-rag-retriever:8000"
# Create app and settings
app, settings = create_app(
service_name="svc-coverage",
title="Tax Agent Coverage Policy Service",
description="Coverage policy evaluation and clarification service",
settings_class=CoverageSettings,
)
# Global state
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
policy_loader: PolicyLoader | None = None
current_policy: Any = None
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, event_bus, policy_loader, current_policy
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
# Initialize policy loader
policy_loader = get_policy_loader(settings.config_dir)
# Load initial policy
try:
policy = policy_loader.load_policy()
current_policy = policy_loader.compile_predicates(policy)
logger.info("Initial policy loaded", version=policy.version)
except Exception as e:
logger.error("Failed to load initial policy", error=str(e))
current_policy = None
logger.info("Coverage service started")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.close()
logger.info("Coverage service stopped")
# Request/Response models
class CheckCoverageRequest(BaseModel):
"""Request to check document coverage"""
tax_year: str
taxpayer_id: str
class ClarifyRequest(BaseModel):
"""Request to generate clarifying question"""
gap: CoverageGap
context: ClarifyContext
class ReloadRequest(BaseModel):
"""Request to reload policy"""
force: bool = False
# Metrics
metrics = get_metrics()
tracer = get_tracer()
@app.post("/v1/coverage/check")
async def check_coverage(
request: CheckCoverageRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> CoverageReport:
"""Check document coverage for taxpayer"""
with tracer.start_as_current_span("check_coverage") as span:
span.set_attribute("taxpayer_id", request.taxpayer_id)
span.set_attribute("tax_year", request.tax_year)
span.set_attribute("tenant_id", tenant_id)
try:
if not current_policy:
raise HTTPException(status_code=503, detail="Policy not loaded")
# Create evaluator with KG and RAG clients
evaluator = CoverageEvaluator(
kg_client=neo4j_client,
rag_client=None, # TODO: Initialize RAG client
)
# Perform coverage evaluation
report = await evaluator.check_document_coverage(
request.taxpayer_id,
request.tax_year,
current_policy,
)
# Record audit trail
await _record_coverage_audit(report, tenant_id)
# Update metrics
metrics.counter("coverage_checks_total").labels(
tenant_id=tenant_id,
tax_year=request.tax_year,
overall_status=report.overall_status.value,
).inc()
return report
except HTTPException:
# Re-raise HTTP exceptions as-is
raise
except Exception as e:
logger.error(
"Coverage check failed",
taxpayer_id=request.taxpayer_id,
tax_year=request.tax_year,
error=str(e),
)
raise HTTPException(
status_code=500, detail=f"Coverage check failed: {str(e)}"
) from e
@app.post("/v1/coverage/clarify")
async def clarify_gap(
request: ClarifyRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> ClarifyResponse:
"""Generate clarifying question for coverage gap"""
with tracer.start_as_current_span("clarify_gap") as span:
span.set_attribute("schedule_id", request.gap.schedule_id)
span.set_attribute("evidence_id", request.gap.evidence_id)
span.set_attribute("tenant_id", tenant_id)
try:
if not current_policy:
raise HTTPException(status_code=503, detail="Policy not loaded")
# Generate clarifying question
response = await _generate_clarifying_question(request.gap, request.context)
# Update metrics
metrics.counter("clarifications_total").labels(
tenant_id=tenant_id,
schedule_id=request.gap.schedule_id,
evidence_id=request.gap.evidence_id,
).inc()
return response
except HTTPException:
# Re-raise HTTP exceptions as-is
raise
except Exception as e:
logger.error(
"Clarification failed",
gap=request.gap.dict(),
error=str(e),
)
raise HTTPException(
status_code=500, detail=f"Clarification failed: {str(e)}"
) from e
@app.post("/admin/coverage/reload")
async def reload_policy(
request: ReloadRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Reload coverage policy from files"""
# Check admin permissions
user_groups = current_user.get("groups", [])
if "admin" not in user_groups:
raise HTTPException(status_code=403, detail="Admin access required")
with tracer.start_as_current_span("reload_policy") as span:
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("force", request.force)
try:
global current_policy
if not policy_loader:
raise HTTPException(
status_code=503, detail="Policy loader not initialized"
)
# Load and compile new policy
policy = policy_loader.load_policy()
new_compiled_policy = policy_loader.compile_predicates(policy)
# Record new policy version
await _record_policy_version(new_compiled_policy, tenant_id)
# Update current policy
current_policy = new_compiled_policy
logger.info(
"Policy reloaded",
version=policy.version,
hash=new_compiled_policy.hash,
tenant_id=tenant_id,
)
return {
"success": True,
"version": policy.version,
"hash": new_compiled_policy.hash,
"compiled_at": new_compiled_policy.compiled_at.isoformat(),
"source_files": new_compiled_policy.source_files,
}
except PolicyError as e:
logger.error("Policy reload failed", error=str(e))
raise HTTPException(
status_code=400, detail=f"Policy error: {str(e)}"
) from e
except Exception as e:
logger.error("Policy reload failed", error=str(e))
raise HTTPException(
status_code=500, detail=f"Reload failed: {str(e)}"
) from e
@app.get("/v1/coverage/policy")
async def get_current_policy(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get current compiled policy (no secrets, no PII)"""
with tracer.start_as_current_span("get_policy") as span:
span.set_attribute("tenant_id", tenant_id)
if not current_policy:
raise HTTPException(status_code=503, detail="Policy not loaded")
# Return sanitized policy info
return {
"version": current_policy.policy.version,
"jurisdiction": current_policy.policy.jurisdiction,
"tax_year": current_policy.policy.tax_year,
"compiled_at": current_policy.compiled_at.isoformat(),
"hash": current_policy.hash,
"source_files": current_policy.source_files,
"schedules": list(current_policy.policy.schedules.keys()),
"document_kinds": current_policy.policy.document_kinds,
}
@app.get("/v1/coverage/validate")
async def validate_policy(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> ValidationResult:
"""Validate current policy configuration"""
with tracer.start_as_current_span("validate_policy") as span:
span.set_attribute("tenant_id", tenant_id)
try:
if not policy_loader:
raise HTTPException(
status_code=503, detail="Policy loader not initialized"
)
# Load policy as dict for validation
policy_dict = policy_loader._load_yaml_file(
os.path.join(settings.config_dir, "coverage.yaml")
)
# Validate policy
result = policy_loader.validate_policy(policy_dict)
# Additional validation: check box existence in KG
if neo4j_client and result.ok:
box_validation_errors = await _validate_boxes_in_kg(policy_dict)
if box_validation_errors:
result.errors.extend(box_validation_errors)
result.ok = False
return result
except Exception as e:
logger.error("Policy validation failed", error=str(e))
return ValidationResult(
ok=False,
errors=[f"Validation failed: {str(e)}"],
)
# Helper functions
async def _record_coverage_audit(report: CoverageReport, tenant_id: str) -> None:
"""Record coverage audit trail"""
# TODO: Implement database recording
logger.info(
"Coverage audit recorded",
taxpayer_id=report.taxpayer_id,
tax_year=report.tax_year,
overall_status=report.overall_status.value,
blocking_items=len(report.blocking_items),
tenant_id=tenant_id,
)
async def _record_policy_version(compiled_policy: Any, tenant_id: str) -> None:
"""Record new policy version"""
# TODO: Implement database recording
logger.info(
"Policy version recorded",
version=compiled_policy.policy.version,
hash=compiled_policy.hash,
tenant_id=tenant_id,
)
async def _generate_clarifying_question(
gap: CoverageGap, context: ClarifyContext
) -> ClarifyResponse:
"""Generate clarifying question for coverage gap"""
if not current_policy:
raise ValueError("Policy not loaded")
# Get question template
templates = current_policy.policy.question_templates
default_template = templates.default
# Build question text
evidence_name = gap.evidence_id
schedule_name = gap.schedule_id
boxes_text = ", ".join(gap.boxes) if gap.boxes else "relevant boxes"
alternatives_text = (
", ".join(gap.acceptable_alternatives)
if gap.acceptable_alternatives
else "alternative documents"
)
question_text = default_template["text"].format(
schedule=schedule_name,
tax_year=context.tax_year,
evidence=evidence_name,
boxes=boxes_text,
alternatives=alternatives_text,
)
why_text = default_template["why"].format(
why=gap.reason,
guidance_doc="policy guidance",
)
# Build upload options
options = []
if gap.acceptable_alternatives:
for alt in gap.acceptable_alternatives:
options.append(
UploadOption(
label=f"Upload {alt} (PDF/CSV)",
accepted_formats=["pdf", "csv"],
upload_endpoint=f"/v1/ingest/upload?tag={alt}",
)
)
else:
options.append(
UploadOption(
label=f"Upload {evidence_name} (PDF/CSV)",
accepted_formats=["pdf", "csv"],
upload_endpoint=f"/v1/ingest/upload?tag={evidence_name}",
)
)
return ClarifyResponse(
question_text=question_text,
why_it_is_needed=why_text,
citations=gap.citations,
options_to_provide=options,
blocking=(gap.role.value == "REQUIRED"),
boxes_affected=gap.boxes,
)
async def _validate_boxes_in_kg(policy_dict: dict[str, Any]) -> list[str]:
"""Validate that all referenced boxes exist in KG"""
if not neo4j_client:
return ["KG client not available for box validation"]
errors = []
all_boxes = set()
# Collect all box references
for schedule in policy_dict.get("schedules", {}).values():
for evidence in schedule.get("evidence", []):
all_boxes.update(evidence.get("boxes", []))
if all_boxes:
try:
from libs.neo import kg_boxes_exist
box_existence = await kg_boxes_exist(neo4j_client, list(all_boxes))
for box_id, exists in box_existence.items():
if not exists:
errors.append(f"Form box '{box_id}' not found in knowledge graph")
except Exception as e:
errors.append(f"Failed to validate boxes in KG: {str(e)}")
return errors
# Health check endpoints
@app.get("/healthz")
async def health_check() -> dict[str, str]:
"""Health check endpoint"""
return {"status": "healthy", "service": "svc-coverage"}
@app.get("/readyz")
async def readiness_check() -> dict[str, str]:
"""Readiness check endpoint"""
return {"status": "ready", "service": "svc-coverage"}
@app.get("/livez")
async def liveness_check() -> dict[str, str]:
"""Liveness check endpoint"""
return {"status": "alive", "service": "svc-coverage"}
# Metrics endpoint (internal only)
@app.get("/metrics")
async def get_metrics_endpoint() -> str:
"""Prometheus metrics endpoint"""
# This would return Prometheus format metrics
return "# Coverage service metrics\n"
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
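
For reference, a rough client-side sketch of calling the coverage endpoints above with httpx. The base URL and the bearer-token header are assumptions; the real authentication and tenant headers are whatever libs.security's get_current_user and get_tenant_id dependencies expect.

import httpx

BASE_URL = "http://localhost:8000"             # assumed local address of svc-coverage
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder credentials

def check_coverage(taxpayer_id: str, tax_year: str) -> dict:
    """POST /v1/coverage/check and return the CoverageReport payload."""
    resp = httpx.post(
        f"{BASE_URL}/v1/coverage/check",
        json={"taxpayer_id": taxpayer_id, "tax_year": tax_year},
        headers=HEADERS,
        timeout=30.0,
    )
    resp.raise_for_status()
    return resp.json()

def policy_summary() -> dict:
    """GET /v1/coverage/policy (sanitised policy info: version, hash, schedules)."""
    resp = httpx.get(f"{BASE_URL}/v1/coverage/policy", headers=HEADERS, timeout=30.0)
    resp.raise_for_status()
    return resp.json()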


@@ -0,0 +1,46 @@
"""Database models for coverage service."""
# FILE: apps/svc_coverage/models.py
from datetime import datetime
from sqlalchemy import JSON, Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class CoverageVersion(Base):
"""Policy version tracking table"""
__tablename__ = "coverage_versions"
id = Column(Integer, primary_key=True, autoincrement=True)
version = Column(String(50), nullable=False)
jurisdiction = Column(String(10), nullable=False)
tax_year = Column(String(10), nullable=False)
tenant_id = Column(String(100), nullable=True)
source_files = Column(JSON, nullable=False, default=list)
compiled_at = Column(DateTime, nullable=False, default=datetime.utcnow)
hash = Column(String(64), nullable=False)
def __repr__(self) -> str:
return f"<CoverageVersion(id={self.id}, version='{self.version}', hash='{self.hash[:8]}...')>"
class CoverageAudit(Base):
"""Coverage evaluation audit trail"""
__tablename__ = "coverage_audit"
id = Column(Integer, primary_key=True, autoincrement=True)
taxpayer_id = Column(String(100), nullable=False)
tax_year = Column(String(10), nullable=False)
policy_version = Column(String(50), nullable=False)
overall_status = Column(String(20), nullable=False)
blocking_items = Column(JSON, nullable=False, default=list)
created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
trace_id = Column(String(100), nullable=True)
def __repr__(self) -> str:
return f"<CoverageAudit(id={self.id}, taxpayer_id='{self.taxpayer_id}', status='{self.overall_status}')>"


@@ -0,0 +1,53 @@
# Multi-stage build for svc-extract
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_extract/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_extract/ ./apps/svc_extract/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_extract.main:app", "--host", "0.0.0.0", "--port", "8000"]

apps/svc_extract/main.py (625 lines added)

@@ -0,0 +1,625 @@
"""LLM-based field extraction with confidence scoring and provenance tracking."""
# FILE: apps/svc_extract/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.calibration import ConfidenceCalibrator
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, ExtractionRequest, ExtractionResponse
from libs.security import (
create_trusted_proxy_middleware,
get_current_user,
get_tenant_id,
)
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class ExtractionSettings(BaseAppSettings):
"""Settings for extraction service"""
service_name: str = "svc-extract"
# LLM configuration
openai_api_key: str = ""
model_name: str = "gpt-4"
max_tokens: int = 2000
temperature: float = 0.1
# Extraction configuration
confidence_threshold: float = 0.7
max_retries: int = 3
chunk_size: int = 4000
# Prompt templates
extraction_prompt_template: str = """
Extract the following fields from this document text:
{field_definitions}
Document text:
{document_text}
Return a JSON object with the extracted fields and confidence scores.
"""
# Create app and settings
app, settings = create_app(
service_name="svc-extract",
title="Tax Agent Extraction Service",
description="LLM-based field extraction service",
settings_class=ExtractionSettings,
)
# Add middleware
middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
app.add_middleware(middleware_factory)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
confidence_calibrator: ConfidenceCalibrator | None = None
tracer = get_tracer("svc-extract")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus, confidence_calibrator
logger.info("Starting extraction service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise Exception("Event bus not initialized")
await event_bus.start()
# Subscribe to OCR completion events
await event_bus.subscribe(EventTopics.DOC_OCR_READY, _handle_ocr_ready)
# Initialize confidence calibrator
confidence_calibrator = ConfidenceCalibrator(method="temperature")
logger.info("Extraction service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down extraction service")
if event_bus:
await event_bus.stop()
logger.info("Extraction service shutdown complete")
@app.get("/healthz")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.get("/readyz")
async def readiness_check() -> dict[str, Any]:
"""Readiness check endpoint"""
return {
"status": "ready",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.get("/livez")
async def liveness_check() -> dict[str, Any]:
"""Liveness check endpoint"""
return {
"status": "alive",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/extract/{doc_id}", response_model=ExtractionResponse)
async def extract_fields(
doc_id: str,
request_data: ExtractionRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user()),
tenant_id: str = Depends(get_tenant_id()),
) -> ExtractionResponse:
"""Extract fields from document"""
with tracer.start_as_current_span("extract_fields") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", request_data.strategy)
try:
# Check if OCR results exist
ocr_results = (
await document_storage.get_ocr_result(tenant_id, doc_id)
if document_storage
else None
)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
# Generate extraction ID
extraction_id = str(ulid.new())
span.set_attribute("extraction_id", extraction_id)
# Start background extraction
background_tasks.add_task(
_extract_fields_async,
doc_id,
tenant_id,
ocr_results,
request_data.strategy,
extraction_id,
current_user.get("sub", "system"),
)
logger.info(
"Field extraction started", doc_id=doc_id, extraction_id=extraction_id
)
return ExtractionResponse(
extraction_id=extraction_id,
confidence=0.0, # Will be updated when processing completes
extracted_fields={},
provenance=[],
)
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start extraction", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start extraction")
@app.get("/results/{doc_id}")
async def get_extraction_results(
doc_id: str,
current_user: dict[str, Any] = Depends(get_current_user()),
tenant_id: str = Depends(get_tenant_id()),
) -> ExtractionResponse:
"""Get extraction results for document"""
with tracer.start_as_current_span("get_extraction_results") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get extraction results from storage
extraction_results = (
await document_storage.get_extraction_result(tenant_id, doc_id)
if document_storage
else None
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# pylint: disable-next=not-a-mapping
return ExtractionResponse(**extraction_results)
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get extraction results", doc_id=doc_id, error=str(e)
)
raise HTTPException(
status_code=500, detail="Failed to get extraction results"
)
async def _handle_ocr_ready(topic: str, payload: EventPayload) -> None:
"""Handle OCR completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
if not doc_id or not tenant_id:
logger.warning("Invalid OCR ready event", data=data)
return
logger.info("Auto-extracting fields from OCR results", doc_id=doc_id)
# Get OCR results
ocr_results = data.get("ocr_results")
if not ocr_results:
ocr_results = (
await document_storage.get_ocr_result(tenant_id, doc_id)
if document_storage
else None
)
if ocr_results:
await _extract_fields_async(
doc_id=doc_id,
tenant_id=tenant_id,
ocr_results=ocr_results,
strategy="hybrid",
extraction_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle OCR ready event", error=str(e))
async def _extract_fields_async(
doc_id: str,
tenant_id: str,
ocr_results: dict[str, Any],
strategy: str,
extraction_id: str,
actor: str,
) -> None:
"""Extract fields asynchronously"""
with tracer.start_as_current_span("extract_fields_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("extraction_id", extraction_id)
span.set_attribute("strategy", strategy)
try:
# Extract text from OCR results
document_text = _extract_text_from_ocr(ocr_results)
# Determine field definitions based on document type
field_definitions = _get_field_definitions(doc_id, document_text)
# Perform extraction
if strategy == "llm":
extracted_fields, confidence, provenance = await _extract_with_llm(
document_text, field_definitions, ocr_results
)
elif strategy == "rules":
extracted_fields, confidence, provenance = await _extract_with_rules(
document_text, field_definitions, ocr_results
)
elif strategy == "hybrid":
# Combine LLM and rules-based extraction
llm_fields, llm_conf, llm_prov = await _extract_with_llm(
document_text, field_definitions, ocr_results
)
rules_fields, rules_conf, rules_prov = await _extract_with_rules(
document_text, field_definitions, ocr_results
)
extracted_fields, confidence, provenance = _merge_extractions(
llm_fields, llm_conf, llm_prov, rules_fields, rules_conf, rules_prov
)
else:
raise ValueError(f"Unknown strategy: {strategy}")
# Calibrate confidence
if confidence_calibrator and confidence_calibrator.is_fitted:
calibrated_confidence = confidence_calibrator.calibrate([confidence])[0]
else:
calibrated_confidence = confidence
# Create extraction results
extraction_results = {
"doc_id": doc_id,
"extraction_id": extraction_id,
"strategy": strategy,
"extracted_at": datetime.utcnow().isoformat(),
"confidence": calibrated_confidence,
"raw_confidence": confidence,
"extracted_fields": extracted_fields,
"provenance": provenance,
"field_count": len(extracted_fields),
}
# Store results
if document_storage:
await document_storage.store_extraction_result(
tenant_id, doc_id, extraction_results
)
# Update metrics
metrics.counter("extractions_completed_total").labels(
tenant_id=tenant_id, strategy=strategy
).inc()
metrics.histogram("extraction_confidence").labels(
strategy=strategy
).observe(calibrated_confidence)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"extraction_id": extraction_id,
"strategy": strategy,
"confidence": calibrated_confidence,
"field_count": len(extracted_fields),
"extraction_results": extraction_results,
},
actor=actor,
tenant_id=tenant_id,
)
if event_bus:
await event_bus.publish(EventTopics.DOC_EXTRACTED, event_payload)
logger.info(
"Field extraction completed",
doc_id=doc_id,
fields=len(extracted_fields),
confidence=calibrated_confidence,
)
except Exception as e:
logger.error("Field extraction failed", doc_id=doc_id, error=str(e))
# Update error metrics
metrics.counter("extraction_errors_total").labels(
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
).inc()
def _extract_text_from_ocr(ocr_results: dict[str, Any]) -> str:
"""Extract text from OCR results"""
text_parts = []
for page in ocr_results.get("pages", []):
if "text" in page:
text_parts.append(page["text"])
elif "tesseract" in page and "text" in page["tesseract"]:
text_parts.append(page["tesseract"]["text"])
return "\n\n".join(text_parts)
def _get_field_definitions(doc_id: str, document_text: str) -> dict[str, str]:
"""Get field definitions based on document type"""
# Analyze document text to determine type
text_lower = document_text.lower()
if "invoice" in text_lower or "bill" in text_lower:
return {
"invoice_number": "Invoice or bill number",
"date": "Invoice date",
"supplier_name": "Supplier or vendor name",
"total_amount": "Total amount including VAT",
"net_amount": "Net amount excluding VAT",
"vat_amount": "VAT amount",
"description": "Description of goods or services",
}
elif "bank statement" in text_lower or "account statement" in text_lower:
return {
"account_number": "Bank account number",
"sort_code": "Bank sort code",
"statement_period": "Statement period",
"opening_balance": "Opening balance",
"closing_balance": "Closing balance",
"transactions": "List of transactions",
}
elif "receipt" in text_lower:
return {
"merchant_name": "Merchant or store name",
"date": "Receipt date",
"total_amount": "Total amount paid",
"payment_method": "Payment method used",
"items": "List of items purchased",
}
else:
# Generic fields
return {
"date": "Any dates mentioned",
"amount": "Any monetary amounts",
"names": "Any person or company names",
"addresses": "Any addresses",
"reference_numbers": "Any reference or account numbers",
}
async def _extract_with_llm(
document_text: str, field_definitions: dict[str, str], ocr_results: dict[str, Any]
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
"""Extract fields using LLM"""
try:
# This would integrate with OpenAI API
# For now, return mock extraction
logger.warning("LLM extraction not implemented, using mock data")
extracted_fields = {}
provenance = []
# Mock extraction based on field definitions
for field_name, _field_desc in field_definitions.items():
if "amount" in field_name.lower():
extracted_fields[field_name] = "£1,234.56"
elif "date" in field_name.lower():
extracted_fields[field_name] = "2024-01-15"
elif "name" in field_name.lower():
extracted_fields[field_name] = "Example Company Ltd"
else:
extracted_fields[field_name] = f"Mock {field_name}"
# Add provenance
provenance.append(
{
"field": field_name,
"value": extracted_fields[field_name],
"confidence": 0.8,
"source": "llm",
"page": 1,
"bbox": [100, 100, 200, 120],
}
)
return extracted_fields, 0.8, provenance
except Exception as e:
logger.error("LLM extraction failed", error=str(e))
return {}, 0.0, []
async def _extract_with_rules(
document_text: str, field_definitions: dict[str, str], ocr_results: dict[str, Any]
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
"""Extract fields using rules-based approach"""
import re
extracted_fields = {}
provenance = []
# Define extraction patterns
patterns = {
"amount": r"£\d{1,3}(?:,\d{3})*(?:\.\d{2})?",
"date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
"invoice_number": r"(?:invoice|inv|bill)\s*#?\s*(\w+)",
"account_number": r"\b\d{8}\b",
"sort_code": r"\b\d{2}-\d{2}-\d{2}\b",
}
for field_name, _field_desc in field_definitions.items():
# Find matching pattern
pattern_key = None
for key in patterns:
if key in field_name.lower():
pattern_key = key
break
if pattern_key:
pattern = patterns[pattern_key]
matches = re.finditer(pattern, document_text, re.IGNORECASE)
for match in matches:
value = match.group(1) if match.groups() else match.group(0)
extracted_fields[field_name] = value
provenance.append(
{
"field": field_name,
"value": value,
"confidence": 0.9,
"source": "rules",
"pattern": pattern,
"match_start": match.start(),
"match_end": match.end(),
}
)
break # Take first match
confidence = 0.9 if extracted_fields else 0.0
return extracted_fields, confidence, provenance
def _merge_extractions(
llm_fields: dict[str, Any],
llm_conf: float,
llm_prov: list[dict[str, Any]],
rules_fields: dict[str, Any],
rules_conf: float,
rules_prov: list[dict[str, Any]],
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
"""Merge LLM and rules-based extractions"""
merged_fields = {}
merged_provenance = []
# Get all field names
all_fields = set(llm_fields.keys()) | set(rules_fields.keys())
for field in all_fields:
llm_value = llm_fields.get(field)
rules_value = rules_fields.get(field)
# Prefer rules-based extraction for structured fields
if rules_value and field in ["amount", "date", "account_number", "sort_code"]:
merged_fields[field] = rules_value
# Find provenance for this field
for prov in rules_prov:
if prov["field"] == field:
merged_provenance.append(prov)
break
elif llm_value:
merged_fields[field] = llm_value
# Find provenance for this field
for prov in llm_prov:
if prov["field"] == field:
merged_provenance.append(prov)
break
# Calculate combined confidence
combined_confidence = (llm_conf + rules_conf) / 2
return merged_fields, combined_confidence, merged_provenance
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8003, reload=True, log_config=None)


@@ -0,0 +1,17 @@
# Service-specific dependencies for svc_extract
# LLM integration
openai>=1.3.0
anthropic>=0.7.0
# JSON schema validation
jsonschema>=4.20.0
# Template processing
jinja2>=3.1.0
# Text similarity (lightweight)
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0
# Data validation
cerberus>=1.3.4
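
The fuzzywuzzy / python-Levenshtein pair above is the usual lightweight choice for matching noisy OCR labels to known field names; a tiny sketch (the candidate list is illustrative, not the service's actual schema):

from fuzzywuzzy import process

KNOWN_FIELDS = ["invoice_number", "total_amount", "vat_amount", "supplier_name"]

label = "Tot al Amnount (inc VAT)"  # typical OCR noise
best_match, score = process.extractOne(label, KNOWN_FIELDS)
print(best_match, score)  # likely "total_amount" with a high similarity score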


@@ -0,0 +1,53 @@
# Multi-stage build for svc_firm_connectors
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_firm_connectors/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_firm_connectors/ ./apps/svc_firm_connectors/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_firm_connectors.main:app", "--host", "0.0.0.0", "--port", "8000"]


@@ -0,0 +1,762 @@
# FILE: apps/svc_firm_connectors/main.py
# mypy: disable-error-code=union-attr
# Firm database integration with practice management systems
import asyncio
import json
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_neo4j_client,
create_vault_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, FirmSyncRequest, FirmSyncResponse
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
logger = structlog.get_logger()
class FirmConnectorsSettings(BaseAppSettings):
"""Settings for firm connectors service"""
service_name: str = "svc-firm-connectors"
# Supported practice management systems
supported_systems: list[str] = [
"iris",
"sage",
"xero",
"quickbooks",
"freeagent",
"kashflow",
]
# Sync configuration
sync_batch_size: int = 100
max_sync_retries: int = 3
sync_timeout: int = 300 # 5 minutes
# Rate limiting
api_rate_limit: int = 100 # requests per minute
# Data mapping
field_mappings_dir: str = "config/firm_mappings"
# Create app and settings
app, settings = create_app(
service_name="svc-firm-connectors",
title="Tax Agent Firm Connectors Service",
description="Practice management system integration",
settings_class=FirmConnectorsSettings,
)
# Global clients
vault_helper: VaultTransitHelper | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-firm-connectors")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global vault_helper, neo4j_client, event_bus
logger.info("Starting firm connectors service")
# Setup observability
setup_observability(settings)
# Initialize Vault helper
vault_client = create_vault_client(settings)
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info("Firm connectors service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down firm connectors service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Firm connectors service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"supported_systems": settings.supported_systems,
}
@app.post("/sync", response_model=FirmSyncResponse)
async def sync_firm_data(
request_data: FirmSyncRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> FirmSyncResponse:
"""Sync data from practice management system"""
with tracer.start_as_current_span("sync_firm_data") as span:
span.set_attribute("system", request_data.system)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("sync_type", request_data.sync_type)
try:
# Validate system
if request_data.system not in settings.supported_systems:
raise HTTPException(
status_code=400, detail=f"Unsupported system: {request_data.system}"
)
# Generate sync ID
sync_id = str(ulid.new())
span.set_attribute("sync_id", sync_id)
# Start background sync
background_tasks.add_task(
_sync_firm_data_async,
request_data.system,
request_data.sync_type,
request_data.connection_config,
tenant_id,
sync_id,
current_user.get("sub", "system"),
)
logger.info(
"Firm data sync started",
sync_id=sync_id,
system=request_data.system,
sync_type=request_data.sync_type,
)
return FirmSyncResponse(
firm_id=request_data.firm_id,
status="syncing",
message=f"Sync started with ID: {sync_id}",
synced_entities=0,
errors=[],
)
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start firm sync", error=str(e))
raise HTTPException(status_code=500, detail="Failed to start firm sync")
@app.get("/sync/{sync_id}")
async def get_sync_status(
sync_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get sync status"""
with tracer.start_as_current_span("get_sync_status") as span:
span.set_attribute("sync_id", sync_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get sync record from Neo4j
query = """
MATCH (s:FirmSync {sync_id: $sync_id, tenant_id: $tenant_id})
WHERE s.retracted_at IS NULL
RETURN s
"""
results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
query, {"sync_id": sync_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Sync not found")
sync_record = results[0]["s"]
return {
"sync_id": sync_id,
"system": sync_record.get("system"),
"status": sync_record.get("status"),
"records_synced": sync_record.get("records_synced", 0),
"total_records": sync_record.get("total_records", 0),
"started_at": sync_record.get("started_at"),
"completed_at": sync_record.get("completed_at"),
"errors": json.loads(sync_record.get("errors", "[]")),
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get sync status", sync_id=sync_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get sync status")
@app.post("/connections/{system}/test")
async def test_connection(
system: str,
connection_config: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Test connection to practice management system"""
with tracer.start_as_current_span("test_connection") as span:
span.set_attribute("system", system)
span.set_attribute("tenant_id", tenant_id)
try:
# Validate system
if system not in settings.supported_systems:
raise HTTPException(
status_code=400, detail=f"Unsupported system: {system}"
)
# Test connection based on system
if system == "iris":
result = await _test_iris_connection(connection_config)
elif system == "sage":
result = await _test_sage_connection(connection_config)
elif system == "xero":
result = await _test_xero_connection(connection_config)
elif system == "quickbooks":
result = await _test_quickbooks_connection(connection_config)
elif system == "freeagent":
result = await _test_freeagent_connection(connection_config)
elif system == "kashflow":
result = await _test_kashflow_connection(connection_config)
else:
raise HTTPException(
status_code=400,
detail=f"Connection test not implemented for {system}",
)
return {
"system": system,
"connection_status": result["status"],
"message": result["message"],
"capabilities": result.get("capabilities", []),
"test_timestamp": datetime.utcnow().isoformat(),
}
except HTTPException:
raise
except Exception as e:
logger.error("Connection test failed", system=system, error=str(e))
raise HTTPException(
status_code=500, detail=f"Connection test failed: {str(e)}"
)
@app.get("/systems")
async def list_supported_systems(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""List supported practice management systems"""
try:
systems_info: list[Any] = []
for system in settings.supported_systems:
system_info = {
"system": system,
"name": _get_system_name(system),
"capabilities": _get_system_capabilities(system),
"connection_fields": _get_connection_fields(system),
}
systems_info.append(system_info)
return {"supported_systems": systems_info, "total_systems": len(systems_info)}
except Exception as e:
logger.error("Failed to list systems", error=str(e))
raise HTTPException(status_code=500, detail="Failed to list systems")
async def _sync_firm_data_async(
system: str,
sync_type: str,
connection_config: dict[str, Any],
tenant_id: str,
sync_id: str,
actor: str,
) -> None:
"""Sync firm data asynchronously"""
with tracer.start_as_current_span("sync_firm_data_async") as span:
span.set_attribute("sync_id", sync_id)
span.set_attribute("system", system)
span.set_attribute("sync_type", sync_type)
try:
# Create sync record
await _create_sync_record(sync_id, system, sync_type, tenant_id)
# Perform sync based on system
if system == "iris":
sync_result = await _sync_iris_data(
connection_config, sync_type, tenant_id
)
elif system == "sage":
sync_result = await _sync_sage_data(
connection_config, sync_type, tenant_id
)
elif system == "xero":
sync_result = await _sync_xero_data(
connection_config, sync_type, tenant_id
)
elif system == "quickbooks":
sync_result = await _sync_quickbooks_data(
connection_config, sync_type, tenant_id
)
elif system == "freeagent":
sync_result = await _sync_freeagent_data(
connection_config, sync_type, tenant_id
)
elif system == "kashflow":
sync_result = await _sync_kashflow_data(
connection_config, sync_type, tenant_id
)
else:
raise Exception(f"Sync not implemented for {system}")
# Update sync record
await _update_sync_record(sync_id, "completed", sync_result)
# Update metrics
metrics.counter("firm_syncs_completed_total").labels(
tenant_id=tenant_id, system=system, sync_type=sync_type
).inc()
metrics.histogram("sync_records_count").labels(
system=system, sync_type=sync_type
).observe(sync_result["records_synced"])
# Publish completion event
event_payload = EventPayload(
data={
"sync_id": sync_id,
"system": system,
"sync_type": sync_type,
"tenant_id": tenant_id,
"records_synced": sync_result["records_synced"],
"entities_created": sync_result.get("entities_created", 0),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.FIRM_SYNC_COMPLETED, event_payload) # type: ignore
logger.info(
"Firm sync completed",
sync_id=sync_id,
system=system,
records=sync_result["records_synced"],
)
except Exception as e:
logger.error("Firm sync failed", sync_id=sync_id, error=str(e))
# Update sync record with error
await _update_sync_record(sync_id, "error", {"error": str(e)})
# Update error metrics
metrics.counter("firm_sync_errors_total").labels(
tenant_id=tenant_id, system=system, error_type=type(e).__name__
).inc()
async def _test_iris_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test IRIS connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["clients", "jobs", "documents"],
}
async def _test_sage_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test Sage connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["customers", "suppliers", "transactions"],
}
async def _test_xero_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test Xero connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["contacts", "invoices", "bank_transactions"],
}
async def _test_quickbooks_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test QuickBooks connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["customers", "vendors", "items", "transactions"],
}
async def _test_freeagent_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test FreeAgent connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["contacts", "projects", "invoices", "expenses"],
}
async def _test_kashflow_connection(config: dict[str, Any]) -> dict[str, Any]:
"""Test KashFlow connection"""
# Mock implementation
await asyncio.sleep(1)
return {
"status": "success",
"message": "Connection successful",
"capabilities": ["customers", "suppliers", "invoices", "receipts"],
}
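# The connection tests above are canned mocks. A hedged sketch of what a real IRIS
# test might look like, assuming an httpx dependency and a hypothetical /api/v1/ping
# endpoint (both are assumptions, not the actual IRIS API):
#
# import httpx
#
# async def _test_iris_connection(config: dict[str, Any]) -> dict[str, Any]:
#     async with httpx.AsyncClient(base_url=config["base_url"], timeout=10) as client:
#         response = await client.get(
#             "/api/v1/ping",
#             headers={"Authorization": f"Bearer {config['api_key']}"},
#         )
#         response.raise_for_status()
#         return {
#             "status": "success",
#             "message": "Connection successful",
#             "capabilities": ["clients", "jobs", "documents"],
#         }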
async def _sync_iris_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from IRIS"""
# Mock implementation
await asyncio.sleep(2)
# Simulate syncing client data
mock_clients = [
{"id": "client_1", "name": "John Doe", "utr": "1234567890"},
{"id": "client_2", "name": "Jane Smith", "utr": "0987654321"},
]
entities_created = 0
for client in mock_clients:
# Create taxpayer profile in KG
taxpayer_properties = {
"taxpayer_id": client["id"],
"name": client["name"],
"utr": client["utr"],
"tenant_id": tenant_id,
"source": "iris_sync",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("TaxpayerProfile", taxpayer_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
entities_created += 1
return {
"records_synced": len(mock_clients),
"entities_created": entities_created,
"sync_type": sync_type,
}
async def _sync_sage_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from Sage"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 5, "entities_created": 5, "sync_type": sync_type}
async def _sync_xero_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from Xero"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 8, "entities_created": 8, "sync_type": sync_type}
async def _sync_quickbooks_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from QuickBooks"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 12, "entities_created": 12, "sync_type": sync_type}
async def _sync_freeagent_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from FreeAgent"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 6, "entities_created": 6, "sync_type": sync_type}
async def _sync_kashflow_data(
config: dict[str, Any], sync_type: str, tenant_id: str
) -> dict[str, Any]:
"""Sync data from KashFlow"""
# Mock implementation
await asyncio.sleep(2)
return {"records_synced": 4, "entities_created": 4, "sync_type": sync_type}
def _get_system_name(system: str) -> str:
"""Get human-readable system name"""
names = {
"iris": "IRIS Practice Management",
"sage": "Sage Practice Management",
"xero": "Xero",
"quickbooks": "QuickBooks",
"freeagent": "FreeAgent",
"kashflow": "KashFlow",
}
return names.get(system, system.title())
def _get_system_capabilities(system: str) -> list[str]:
"""Get system capabilities"""
capabilities = {
"iris": ["clients", "jobs", "documents", "time_tracking"],
"sage": ["customers", "suppliers", "transactions", "reports"],
"xero": ["contacts", "invoices", "bank_transactions", "reports"],
"quickbooks": ["customers", "vendors", "items", "transactions", "reports"],
"freeagent": ["contacts", "projects", "invoices", "expenses", "time_tracking"],
"kashflow": ["customers", "suppliers", "invoices", "receipts", "reports"],
}
return capabilities.get(system, [])
def _get_connection_fields(system: str) -> list[dict[str, Any]]:
"""Get required connection fields for system"""
fields = {
"iris": [
{
"name": "api_key",
"type": "string",
"required": True,
"description": "IRIS API Key",
},
{
"name": "base_url",
"type": "string",
"required": True,
"description": "IRIS Base URL",
},
],
"sage": [
{
"name": "username",
"type": "string",
"required": True,
"description": "Sage Username",
},
{
"name": "password",
"type": "password",
"required": True,
"description": "Sage Password",
},
{
"name": "database",
"type": "string",
"required": True,
"description": "Database Name",
},
],
"xero": [
{
"name": "client_id",
"type": "string",
"required": True,
"description": "Xero Client ID",
},
{
"name": "client_secret",
"type": "password",
"required": True,
"description": "Xero Client Secret",
},
{
"name": "tenant_id",
"type": "string",
"required": True,
"description": "Xero Tenant ID",
},
],
"quickbooks": [
{
"name": "client_id",
"type": "string",
"required": True,
"description": "QuickBooks Client ID",
},
{
"name": "client_secret",
"type": "password",
"required": True,
"description": "QuickBooks Client Secret",
},
{
"name": "company_id",
"type": "string",
"required": True,
"description": "Company ID",
},
],
"freeagent": [
{
"name": "client_id",
"type": "string",
"required": True,
"description": "FreeAgent Client ID",
},
{
"name": "client_secret",
"type": "password",
"required": True,
"description": "FreeAgent Client Secret",
},
],
"kashflow": [
{
"name": "username",
"type": "string",
"required": True,
"description": "KashFlow Username",
},
{
"name": "password",
"type": "password",
"required": True,
"description": "KashFlow Password",
},
],
}
return fields.get(system, [])
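# Based on the field definitions above, a Xero connection_config accepted by the
# test_connection endpoint would look like this (placeholder values only):
#
# {
#     "client_id": "your-xero-client-id",
#     "client_secret": "your-xero-client-secret",
#     "tenant_id": "your-xero-tenant-id",
# }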
async def _create_sync_record(
sync_id: str, system: str, sync_type: str, tenant_id: str
) -> None:
"""Create sync record in knowledge graph"""
sync_properties = {
"sync_id": sync_id,
"system": system,
"sync_type": sync_type,
"tenant_id": tenant_id,
"status": "running",
"started_at": datetime.utcnow().isoformat(),
"records_synced": 0,
"errors": "[]",
"source": "firm_connectors",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("FirmSync", sync_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
async def _update_sync_record(
sync_id: str, status: str, result: dict[str, Any]
) -> None:
"""Update sync record with results"""
update_properties = {
"status": status,
"completed_at": datetime.utcnow().isoformat(),
"records_synced": result.get("records_synced", 0),
"total_records": result.get("total_records", 0),
"errors": json.dumps(result.get("errors", [])),
}
# This would update the existing node
# For now, just log
logger.debug(
"Sync record updated",
sync_id=sync_id,
status=status,
properties=update_properties,
)
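# A minimal sketch of the intended update, assuming Neo4jClient.run_query accepts
# parameterized Cypher as used elsewhere in this service (the SET query itself is
# an assumption, not an existing helper):
#
# update_query = """
# MATCH (s:FirmSync {sync_id: $sync_id})
# WHERE s.retracted_at IS NULL
# SET s += $props
# RETURN s
# """
# await neo4j_client.run_query(
#     update_query, {"sync_id": sync_id, "props": update_properties}
# )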
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8011, reload=True, log_config=None)

45
apps/svc_firm_connectors/requirements.txt Normal file
View File

@@ -0,0 +1,45 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Database connectors
sqlalchemy>=2.0.0
pymssql>=2.2.0
cx-Oracle>=8.3.0
# API clients for practice management systems
zeep>=4.2.0 # SOAP client
xmltodict>=0.13.0
# OAuth for various systems
authlib>=1.2.0
requests-oauthlib>=1.3.0
# Data synchronization
pandas>=2.1.0
# Rate limiting
ratelimit>=2.2.0
# Retry mechanisms
tenacity>=8.2.0
# CSV processing
csvkit>=1.1.0
# Excel file processing
openpyxl>=3.1.0
xlrd>=2.0.0
# Data validation
marshmallow>=3.20.0
cerberus>=1.3.4
# Connection pooling (built into SQLAlchemy)
# sqlalchemy-pool>=1.3.0 # Package doesn't exist, pooling is built into SQLAlchemy
# Additional utilities
python-dateutil>=2.8.0
pytz>=2023.3

53
apps/svc_forms/Dockerfile Normal file
View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_forms
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_forms/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_forms/ ./apps/svc_forms/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_forms.main:app", "--host", "0.0.0.0", "--port", "8000"]

625
apps/svc_forms/main.py Normal file
View File

@@ -0,0 +1,625 @@
"""PDF form filling with evidence pack generation."""
# FILE: apps/svc-forms/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from io import BytesIO
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse, Response
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_minio_client,
create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.forms import UK_TAX_FORMS, EvidencePackGenerator, PDFFormFiller
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class FormsSettings(BaseAppSettings):
"""Settings for forms service"""
service_name: str = "svc-forms"
# Form templates
forms_template_dir: str = "forms/templates"
output_bucket: str = "filled-forms"
evidence_packs_bucket: str = "evidence-packs"
# Supported forms
supported_forms: list[str] = ["SA100", "SA103", "SA105", "SA106"]
# PDF configuration
pdf_quality: str = "high"
flatten_forms: bool = True
# Create app and settings
app, settings = create_app(
service_name="svc-forms",
title="Tax Agent Forms Service",
description="PDF form filling and evidence pack generation",
settings_class=FormsSettings,
)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
pdf_form_filler: PDFFormFiller | None = None
evidence_pack_generator: EvidencePackGenerator | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-forms")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, pdf_form_filler # pylint: disable=line-too-long
global evidence_pack_generator, event_bus
logger.info("Starting forms service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize PDF form filler
pdf_form_filler = PDFFormFiller()
# Load form templates
for form_id in settings.supported_forms:
template_path = os.path.join(settings.forms_template_dir, f"{form_id}.pdf")
if os.path.exists(template_path):
pdf_form_filler.load_template(form_id, template_path)
else:
logger.warning(
"Form template not found", form_id=form_id, path=template_path
)
# Initialize evidence pack generator
evidence_pack_generator = EvidencePackGenerator(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Subscribe to calculation completion events
await event_bus.subscribe( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
EventTopics.CALC_SCHEDULE_READY, _handle_calculation_ready
)
# Ensure buckets exist
await storage_client.ensure_bucket(settings.output_bucket)
await storage_client.ensure_bucket(settings.evidence_packs_bucket)
logger.info("Forms service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down forms service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Forms service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": "1.0.0",
"timestamp": datetime.now().isoformat(),
"supported_forms": settings.supported_forms,
}
@app.post("/fill/{form_id}")
async def fill_form(
form_id: str,
field_values: dict[str, Any],
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Fill PDF form with provided values"""
with tracer.start_as_current_span("fill_form") as span:
span.set_attribute("form_id", form_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("field_count", len(field_values))
try:
# Validate form ID
if form_id not in settings.supported_forms:
raise HTTPException(
status_code=400, detail=f"Unsupported form: {form_id}"
)
# Generate filling ID
filling_id = str(ulid.new())
span.set_attribute("filling_id", filling_id)
# Start background form filling
background_tasks.add_task(
_fill_form_async,
form_id,
field_values,
tenant_id,
filling_id,
current_user.get("sub", "system"),
)
logger.info("Form filling started", form_id=form_id, filling_id=filling_id)
return {
"filling_id": filling_id,
"form_id": form_id,
"status": "filling",
"field_count": len(field_values),
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start form filling", form_id=form_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start form filling")
@app.post("/fill-from-calculation/{calculation_id}")
async def fill_form_from_calculation(
calculation_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Fill form using calculation results"""
with tracer.start_as_current_span("fill_form_from_calculation") as span:
span.set_attribute("calculation_id", calculation_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get calculation from Neo4j
calc_query = """
MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})
WHERE c.retracted_at IS NULL
RETURN c
"""
calc_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
calc_query, {"calculation_id": calculation_id, "tenant_id": tenant_id}
)
if not calc_results:
raise HTTPException(status_code=404, detail="Calculation not found")
calculation = calc_results[0]["c"]
form_id = calculation.get("schedule")
if not form_id:
raise HTTPException(
status_code=400, detail="No schedule found in calculation"
)
# Get form boxes
boxes_query = """
MATCH (c:Calculation {calculation_id: $calculation_id})-[:HAS_BOX]->(b:FormBox)
WHERE c.retracted_at IS NULL AND b.retracted_at IS NULL
RETURN b
"""
box_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
boxes_query, {"calculation_id": calculation_id}
)
# Convert form boxes to field values
field_values = {}
for box_result in box_results:
box = box_result["b"]
field_values[f"box_{box['box']}"] = box["value"]
# Generate filling ID
filling_id = str(ulid.new())
span.set_attribute("filling_id", filling_id)
span.set_attribute("form_id", form_id)
# Start background form filling
background_tasks.add_task(
_fill_form_async,
form_id,
field_values,
tenant_id,
filling_id,
current_user.get("sub", "system"),
calculation_id,
)
logger.info(
"Form filling from calculation started",
form_id=form_id,
filling_id=filling_id,
calculation_id=calculation_id,
)
return {
"filling_id": filling_id,
"form_id": form_id,
"calculation_id": calculation_id,
"status": "filling",
"field_count": len(field_values),
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to fill form from calculation",
calculation_id=calculation_id,
error=str(e),
)
raise HTTPException(
status_code=500, detail="Failed to fill form from calculation"
)
@app.get("/download/{filling_id}")
async def download_filled_form(
filling_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> Response:
"""Download filled form"""
with tracer.start_as_current_span("download_filled_form") as span:
span.set_attribute("filling_id", filling_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get filled form from storage
object_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
form_content = await storage_client.get_object( # pyright: ignore[reportOptionalMemberAccess]
settings.output_bucket, object_key
)
if not form_content:
raise HTTPException(status_code=404, detail="Filled form not found")
return Response(
content=form_content,
media_type="application/pdf",
headers={
"Content-Disposition": f"attachment; filename={filling_id}.pdf"
},
)
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to download filled form", filling_id=filling_id, error=str(e)
)
raise HTTPException(
status_code=500, detail="Failed to download filled form"
)
@app.post("/evidence-pack")
async def create_evidence_pack(
taxpayer_id: str,
tax_year: str,
scope: str,
evidence_items: list[dict[str, Any]],
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create evidence pack with supporting documents"""
with tracer.start_as_current_span("create_evidence_pack") as span:
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("tax_year", tax_year)
span.set_attribute("scope", scope)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("evidence_count", len(evidence_items))
try:
# Generate pack ID
pack_id = str(ulid.new())
span.set_attribute("pack_id", pack_id)
# Start background pack creation
background_tasks.add_task(
_create_evidence_pack_async,
taxpayer_id,
tax_year,
scope,
evidence_items,
tenant_id,
pack_id,
current_user.get("sub", "system"),
)
logger.info(
"Evidence pack creation started",
pack_id=pack_id,
taxpayer_id=taxpayer_id,
scope=scope,
)
return {
"pack_id": pack_id,
"taxpayer_id": taxpayer_id,
"tax_year": tax_year,
"scope": scope,
"status": "creating",
"evidence_count": len(evidence_items),
}
except Exception as e:
logger.error("Failed to start evidence pack creation", error=str(e))
raise HTTPException(
status_code=500, detail="Failed to start evidence pack creation"
)
@app.get("/forms")
async def list_supported_forms(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""List supported forms with field information"""
try:
forms_info = []
for form_id in settings.supported_forms:
form_config = UK_TAX_FORMS.get(form_id, {})
# Get form fields if template is loaded
fields = []
if pdf_form_filler and form_id in pdf_form_filler.form_templates:
fields = pdf_form_filler.get_form_fields(form_id)
forms_info.append(
{
"form_id": form_id,
"name": form_config.get("name", form_id),
"template_available": form_id
in (pdf_form_filler.form_templates if pdf_form_filler else {}),
"field_count": len(fields),
"fields": fields[:10], # Limit to first 10 fields for overview
}
)
return {"supported_forms": forms_info, "total_forms": len(forms_info)}
except Exception as e:
logger.error("Failed to list forms", error=str(e))
raise HTTPException(status_code=500, detail="Failed to list forms")
async def _handle_calculation_ready(topic: str, payload: EventPayload) -> None:
"""Handle calculation completion events for auto-form filling"""
try:
data = payload.data
calculation_id = data.get("calculation_id")
schedule = data.get("schedule")
tenant_id = data.get("tenant_id")
if not calculation_id or not schedule or not tenant_id:
logger.warning("Invalid calculation ready event", data=data)
return
logger.info(
"Auto-filling form from calculation",
calculation_id=calculation_id,
schedule=schedule,
)
# Get form boxes from event data
form_boxes = data.get("form_boxes", {})
# Convert to field values
field_values = {}
for box_id, box_data in form_boxes.items():
field_values[f"box_{box_id}"] = box_data.get("value")
await _fill_form_async(
form_id=schedule,
field_values=field_values,
tenant_id=tenant_id,
filling_id=str(ulid.new()),
actor=payload.actor,
calculation_id=calculation_id,
)
except Exception as e:
logger.error("Failed to handle calculation ready event", error=str(e))
async def _fill_form_async(
form_id: str,
field_values: dict[str, Any],
tenant_id: str,
filling_id: str,
actor: str,
calculation_id: str | None = None,
) -> None:
"""Fill form asynchronously"""
with tracer.start_as_current_span("fill_form_async") as span:
span.set_attribute("form_id", form_id)
span.set_attribute("filling_id", filling_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Fill the form
filled_pdf = pdf_form_filler.fill_form(form_id, field_values) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
if not filled_pdf:
# pylint: disable-next=broad-exception-raised
raise Exception("Form filling failed")
# Store filled form
object_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
success = await storage_client.put_object( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
bucket_name=settings.output_bucket,
object_name=object_key,
data=BytesIO(filled_pdf),
length=len(filled_pdf),
content_type="application/pdf",
metadata={
"form_id": form_id,
"filling_id": filling_id,
"tenant_id": tenant_id,
"calculation_id": calculation_id or "",
"filled_at": datetime.utcnow().isoformat(),
},
)
if not success:
# pylint: disable-next=broad-exception-raised
raise Exception("Failed to store filled form")
# Update metrics
metrics.counter("forms_filled_total").labels(
tenant_id=tenant_id, form_id=form_id
).inc()
# Publish completion event
event_payload = EventPayload(
data={
"filling_id": filling_id,
"form_id": form_id,
"tenant_id": tenant_id,
"calculation_id": calculation_id,
"s3_url": f"s3://{settings.output_bucket}/{object_key}",
"field_count": len(field_values),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.FORM_FILLED, event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info(
"Form filling completed", filling_id=filling_id, form_id=form_id
)
except Exception as e:
logger.error("Form filling failed", filling_id=filling_id, error=str(e))
# Update error metrics
metrics.counter("form_filling_errors_total").labels(
tenant_id=tenant_id, form_id=form_id, error_type=type(e).__name__
).inc()
async def _create_evidence_pack_async(
taxpayer_id: str,
tax_year: str,
scope: str,
evidence_items: list[dict[str, Any]],
tenant_id: str,
pack_id: str,
actor: str,
) -> None:
"""Create evidence pack asynchronously"""
with tracer.start_as_current_span("create_evidence_pack_async") as span:
span.set_attribute("pack_id", pack_id)
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("scope", scope)
try:
# Create evidence pack
pack_result = await evidence_pack_generator.create_evidence_pack( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
taxpayer_id=taxpayer_id,
tax_year=tax_year,
scope=scope,
evidence_items=evidence_items,
)
# Update metrics
metrics.counter("evidence_packs_created_total").labels(
tenant_id=tenant_id, scope=scope
).inc()
logger.info(
"Evidence pack created",
pack_id=pack_id,
pack_size=pack_result["pack_size"],
evidence_count=pack_result["evidence_count"],
)
except Exception as e:
logger.error("Evidence pack creation failed", pack_id=pack_id, error=str(e))
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8009, reload=True, log_config=None)

37
apps/svc_forms/requirements.txt Normal file
View File

@@ -0,0 +1,37 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# PDF form filling
pdfrw>=0.4
reportlab>=4.0.0
# PDF processing
PyPDF2>=3.0.0
pypdf>=3.17.0
# Image processing for overlays
Pillow>=10.1.0
# ZIP file creation for evidence packs (zipfile is in the standard library)
# zipfile36>=0.1.3  # Backport of the Python 3.6 zipfile module; not needed on Python 3.12
# Template processing
jinja2>=3.1.0
# QR code generation
qrcode>=7.4.0
# Barcode generation
python-barcode>=0.15.0
# Font handling
fonttools>=4.44.0
# Additional PDF utilities
pdfminer.six>=20231228
# Document conversion
python-docx>=1.1.0

54
apps/svc_hmrc/Dockerfile Normal file
View File

@@ -0,0 +1,54 @@
# Multi-stage build for svc_hmrc
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_hmrc/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_hmrc/ ./apps/svc_hmrc/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_hmrc.main:app", "--host", "0.0.0.0", "--port", "8000"]

759
apps/svc_hmrc/main.py Normal file
View File

@@ -0,0 +1,759 @@
# FILE: apps/svc-hmrc/main.py
# HMRC submission service with MTD API integration and validation
import asyncio
import json
import os
import urllib.parse
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_neo4j_client,
create_vault_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, HMRCSubmissionRequest, HMRCSubmissionResponse
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
logger = structlog.get_logger()
class HMRCSettings(BaseAppSettings):
"""Settings for HMRC service"""
service_name: str = "svc-hmrc"
# HMRC API configuration
hmrc_base_url: str = "https://api.service.hmrc.gov.uk"
hmrc_sandbox_url: str = "https://test-api.service.hmrc.gov.uk"
use_sandbox: bool = True
# OAuth configuration
client_id: str = ""
client_secret: str = ""
redirect_uri: str = "http://localhost:8000/oauth/callback"
# API endpoints
mtd_income_tax_endpoint: str = (
"/income-tax/self-assessment/ni/{nino}/uk-property/{taxYear}"
)
mtd_self_employment_endpoint: str = (
"/income-tax/self-assessment/ni/{nino}/self-employment/{businessId}"
)
# Validation
max_submission_retries: int = 3
submission_timeout: int = 300 # 5 minutes
# Create app and settings
app, settings = create_app(
service_name="svc-hmrc",
title="Tax Agent HMRC Service",
description="HMRC submission service with MTD API integration",
settings_class=HMRCSettings,
)
# Global clients
vault_helper: VaultTransitHelper | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-hmrc")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global vault_helper, neo4j_client, event_bus
logger.info("Starting HMRC service")
# Setup observability
setup_observability(settings)
# Initialize Vault helper
vault_client = create_vault_client(settings)
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise Exception("Event bus not initialized")
await event_bus.start()
# Subscribe to form completion events
await event_bus.subscribe(EventTopics.FORM_FILLED, _handle_form_filled) # type: ignore
logger.info("HMRC service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down HMRC service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("HMRC service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"hmrc_environment": "sandbox" if settings.use_sandbox else "production",
}
@app.post("/submit", response_model=HMRCSubmissionResponse)
async def submit_to_hmrc(
request_data: HMRCSubmissionRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> HMRCSubmissionResponse:
"""Submit tax return to HMRC"""
with tracer.start_as_current_span("submit_to_hmrc") as span:
span.set_attribute("tax_year", request_data.tax_year)
span.set_attribute("taxpayer_id", request_data.taxpayer_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("dry_run", request_data.dry_run)
try:
# Generate submission ID
submission_id = str(ulid.new())
span.set_attribute("submission_id", submission_id)
# Start background submission
background_tasks.add_task(
_submit_to_hmrc_async,
request_data.tax_year,
request_data.taxpayer_id,
request_data.dry_run,
tenant_id,
submission_id,
current_user.get("sub", "system"),
)
logger.info(
"HMRC submission started",
submission_id=submission_id,
taxpayer_id=request_data.taxpayer_id,
dry_run=request_data.dry_run,
)
return HMRCSubmissionResponse(
submission_id=submission_id,
status="processing",
hmrc_reference=None,
submission_timestamp=datetime.utcnow(),
validation_results={},
dry_run=request_data.dry_run,
)
except Exception as e:
logger.error("Failed to start HMRC submission", error=str(e))
raise HTTPException(
status_code=500, detail="Failed to start HMRC submission"
)
@app.get("/submissions/{submission_id}")
async def get_submission_status(
submission_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get submission status"""
with tracer.start_as_current_span("get_submission_status") as span:
span.set_attribute("submission_id", submission_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get submission from Neo4j
query = """
MATCH (s:Submission {submission_id: $submission_id, tenant_id: $tenant_id})
WHERE s.retracted_at IS NULL
RETURN s
"""
if not neo4j_client:
raise Exception("Neo4j client not initialized")
results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
query, {"submission_id": submission_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Submission not found")
submission = results[0]["s"]
return {
"submission_id": submission_id,
"status": submission.get("status"),
"hmrc_reference": submission.get("hmrc_reference"),
"submission_timestamp": submission.get("submission_timestamp"),
"validation_results": json.loads(
submission.get("validation_results", "{}")
),
"dry_run": submission.get("dry_run", False),
"error_message": submission.get("error_message"),
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get submission status",
submission_id=submission_id,
error=str(e),
)
raise HTTPException(
status_code=500, detail="Failed to get submission status"
)
@app.post("/oauth/authorize")
async def initiate_oauth_flow(
taxpayer_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Initiate OAuth flow for HMRC authorization"""
with tracer.start_as_current_span("initiate_oauth") as span:
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Generate state parameter for security
state = str(ulid.new())
# Build authorization URL
base_url = (
settings.hmrc_sandbox_url
if settings.use_sandbox
else settings.hmrc_base_url
)
auth_url = f"{base_url}/oauth/authorize"
params = {
"response_type": "code",
"client_id": settings.client_id,
"scope": "read:self-assessment write:self-assessment",
"state": state,
"redirect_uri": settings.redirect_uri,
}
# Store state for validation
await _store_oauth_state(state, taxpayer_id, tenant_id)
# Build full URL (percent-encode the query string; the scope value contains spaces)
param_string = urllib.parse.urlencode(params)
full_auth_url = f"{auth_url}?{param_string}"
return {
"authorization_url": full_auth_url,
"state": state,
"expires_in": 600, # 10 minutes
}
except Exception as e:
logger.error("Failed to initiate OAuth flow", error=str(e))
raise HTTPException(status_code=500, detail="Failed to initiate OAuth flow")
@app.post("/oauth/callback")
async def handle_oauth_callback(
code: str,
state: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Handle OAuth callback from HMRC"""
with tracer.start_as_current_span("handle_oauth_callback") as span:
span.set_attribute("state", state)
span.set_attribute("tenant_id", tenant_id)
if not neo4j_client:
raise HTTPException(status_code=500, detail="Neo4j client not initialized")
try:
# Validate state
oauth_data = await _get_oauth_state(state)
if not oauth_data or oauth_data.get("tenant_id") != tenant_id:
raise HTTPException(status_code=400, detail="Invalid state parameter")
# Exchange code for access token
token_data = await _exchange_code_for_token(code)
# Store encrypted tokens
if vault_helper is None:
raise HTTPException(
status_code=500, detail="Vault helper not initialized"
)
encrypted_access_token = vault_helper.encrypt_field(
"hmrc-access-token", token_data["access_token"]
)
encrypted_refresh_token = vault_helper.encrypt_field(
"hmrc-refresh-token", token_data.get("refresh_token", "")
)
# Store authorization in Neo4j
auth_properties = {
"taxpayer_id": oauth_data["taxpayer_id"],
"tenant_id": tenant_id,
"access_token": encrypted_access_token,
"refresh_token": encrypted_refresh_token,
"expires_at": datetime.utcnow().timestamp()
+ token_data.get("expires_in", 3600),
"scope": token_data.get("scope", ""),
"authorized_at": datetime.utcnow().isoformat(),
"source": "oauth_flow",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("HMRCAuthorization", auth_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Clean up state
await _delete_oauth_state(state)
return {
"status": "authorized",
"taxpayer_id": oauth_data["taxpayer_id"],
"scope": token_data.get("scope", ""),
"expires_in": token_data.get("expires_in", 3600),
}
except HTTPException:
raise
except Exception as e:
logger.error("OAuth callback failed", error=str(e))
raise HTTPException(status_code=500, detail="OAuth callback failed")
async def _handle_form_filled(topic: str, payload: EventPayload) -> None:
"""Handle form completion events for auto-submission"""
try:
if not neo4j_client:
raise Exception("Neo4j client not initialized")
data = payload.data
form_id = data.get("form_id")
tenant_id = data.get("tenant_id")
calculation_id = data.get("calculation_id")
if not form_id or not tenant_id:
logger.warning("Invalid form filled event", data=data)
return
# Only auto-submit if configured (this would be a tenant setting)
auto_submit = False # Default to false for safety
if auto_submit and calculation_id:
logger.info(
"Auto-submitting form to HMRC",
form_id=form_id,
calculation_id=calculation_id,
)
# Get taxpayer ID from calculation
calc_query = """
MATCH (c:Calculation {calculation_id: $calculation_id})
WHERE c.retracted_at IS NULL
RETURN c.taxpayer_id as taxpayer_id, c.tax_year as tax_year
"""
calc_results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
calc_query, {"calculation_id": calculation_id}
)
if calc_results:
taxpayer_id = calc_results[0]["taxpayer_id"]
tax_year = calc_results[0]["tax_year"]
await _submit_to_hmrc_async(
tax_year=tax_year,
taxpayer_id=taxpayer_id,
dry_run=True, # Always dry run for auto-submission
tenant_id=tenant_id,
submission_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle form filled event", error=str(e))
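# The auto_submit flag in _handle_form_filled above is hard-coded to False. A hedged
# sketch of reading it as a per-tenant setting instead (the TenantSettings label and
# auto_submit_hmrc property are assumptions, not existing graph schema):
#
# settings_query = """
# MATCH (t:TenantSettings {tenant_id: $tenant_id})
# WHERE t.retracted_at IS NULL
# RETURN t.auto_submit_hmrc AS auto_submit
# """
# rows = await neo4j_client.run_query(settings_query, {"tenant_id": tenant_id})
# auto_submit = bool(rows and rows[0].get("auto_submit"))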
async def _submit_to_hmrc_async(
tax_year: str,
taxpayer_id: str,
dry_run: bool,
tenant_id: str,
submission_id: str,
actor: str,
) -> None:
"""Submit to HMRC asynchronously"""
with tracer.start_as_current_span("submit_to_hmrc_async") as span:
span.set_attribute("submission_id", submission_id)
span.set_attribute("taxpayer_id", taxpayer_id)
span.set_attribute("dry_run", dry_run)
if not event_bus:
raise Exception("Event bus not initialized")
try:
# Get taxpayer data
taxpayer_data = await _get_taxpayer_data(taxpayer_id, tenant_id)
# Get calculation data
calculation_data = await _get_latest_calculation(
taxpayer_id, tax_year, tenant_id
)
# Validate data
validation_results = await _validate_submission_data(
taxpayer_data, calculation_data
)
# Prepare submission
submission_data = await _prepare_submission_data(
taxpayer_data, calculation_data, tax_year
)
# Submit to HMRC (or simulate if dry run)
if dry_run:
hmrc_response = await _simulate_hmrc_submission(submission_data)
else:
hmrc_response = await _submit_to_hmrc_api(
submission_data, taxpayer_id, tenant_id
)
# Store submission record
await _store_submission_record(
submission_id,
taxpayer_id,
tax_year,
tenant_id,
hmrc_response,
validation_results,
dry_run,
)
# Update metrics
metrics.counter("hmrc_submissions_total").labels(
tenant_id=tenant_id,
dry_run=str(dry_run),
status=hmrc_response.get("status", "unknown"),
).inc()
# Publish completion event
event_payload = EventPayload(
data={
"submission_id": submission_id,
"taxpayer_id": taxpayer_id,
"tax_year": tax_year,
"tenant_id": tenant_id,
"status": hmrc_response.get("status"),
"hmrc_reference": hmrc_response.get("reference"),
"dry_run": dry_run,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.HMRC_SUBMITTED, event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info(
"HMRC submission completed",
submission_id=submission_id,
status=hmrc_response.get("status"),
dry_run=dry_run,
)
except Exception as e:
logger.error(
"HMRC submission failed", submission_id=submission_id, error=str(e)
)
# Store error record
await _store_submission_error(submission_id, str(e), tenant_id)
# Update error metrics
metrics.counter("hmrc_submission_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
async def _get_taxpayer_data(taxpayer_id: str, tenant_id: str) -> dict[str, Any]:
"""Get taxpayer data from knowledge graph"""
query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})
WHERE t.retracted_at IS NULL
RETURN t
"""
if not neo4j_client:
raise Exception("Neo4j client not initialized")
results = await neo4j_client.run_query(
query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id}
)
if not results:
raise Exception(f"Taxpayer not found: {taxpayer_id}")
return results[0]["t"]
async def _get_latest_calculation(
taxpayer_id: str, tax_year: str, tenant_id: str
) -> dict[str, Any]:
"""Get latest calculation for taxpayer and tax year"""
query = """
MATCH (c:Calculation {taxpayer_id: $taxpayer_id, tax_year: $tax_year, tenant_id: $tenant_id})
WHERE c.retracted_at IS NULL
RETURN c
ORDER BY c.calculated_at DESC
LIMIT 1
"""
if not neo4j_client:
raise Exception("Neo4j client not initialized")
results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
query,
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
)
if not results:
raise Exception(
f"No calculation found for taxpayer {taxpayer_id} and tax year {tax_year}"
)
return results[0]["c"]
async def _validate_submission_data(
taxpayer_data: dict[str, Any], calculation_data: dict[str, Any]
) -> dict[str, Any]:
"""Validate submission data"""
validation_results: dict[str, bool | list[str]] = {
"valid": True,
"errors": [],
"warnings": [],
}
# Check required taxpayer fields
if not taxpayer_data.get("utr"):
validation_results["errors"].append("UTR is required")
validation_results["valid"] = False
if not taxpayer_data.get("ni_number"):
validation_results["errors"].append("National Insurance number is required")
validation_results["valid"] = False
# Check calculation data
if not calculation_data.get("schedule"):
validation_results["errors"].append("Schedule is required")
validation_results["valid"] = False
return validation_results
async def _prepare_submission_data(
taxpayer_data: dict[str, Any], calculation_data: dict[str, Any], tax_year: str
) -> dict[str, Any]:
"""Prepare data for HMRC submission"""
# This would format data according to HMRC MTD API requirements
submission_data = {
"taxYear": tax_year,
"nino": taxpayer_data.get("ni_number"),
"utr": taxpayer_data.get("utr"),
"schedule": calculation_data.get("schedule"),
"submissionTimestamp": datetime.utcnow().isoformat(),
}
return submission_data
async def _simulate_hmrc_submission(submission_data: dict[str, Any]) -> dict[str, Any]:
"""Simulate HMRC submission for dry run"""
# Simulate processing delay
await asyncio.sleep(1)
return {
"status": "accepted",
"reference": f"DRY_RUN_{ulid.new()}",
"timestamp": datetime.utcnow().isoformat(),
"dry_run": True,
}
async def _submit_to_hmrc_api(
submission_data: dict[str, Any], taxpayer_id: str, tenant_id: str
) -> dict[str, Any]:
"""Submit to actual HMRC API"""
# This would implement the actual HMRC MTD API calls
# For now, return mock response
logger.warning("Actual HMRC API submission not implemented")
return {
"status": "not_implemented",
"reference": None,
"timestamp": datetime.utcnow().isoformat(),
"error": "HMRC API integration not implemented",
}
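# A hedged sketch of the real submission call, assuming an httpx dependency, an access
# token already retrieved and decrypted for the taxpayer, and that the endpoint path,
# headers and response fields are taken from the HMRC MTD specification (all of these
# are assumptions, not implemented here):
#
# import httpx
#
# base_url = settings.hmrc_sandbox_url if settings.use_sandbox else settings.hmrc_base_url
# async with httpx.AsyncClient(base_url=base_url, timeout=settings.submission_timeout) as client:
#     response = await client.post(
#         endpoint_path,  # built from the MTD endpoint templates in HMRCSettings
#         json=submission_data,
#         headers={
#             "Authorization": f"Bearer {access_token}",
#             "Accept": "application/vnd.hmrc.1.0+json",
#         },
#     )
#     response.raise_for_status()
#     body = response.json()
#     return {
#         "status": "accepted",
#         "reference": body.get("transactionReference"),  # illustrative field name
#         "timestamp": datetime.utcnow().isoformat(),
#     }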
async def _store_submission_record(
submission_id: str,
taxpayer_id: str,
tax_year: str,
tenant_id: str,
hmrc_response: dict[str, Any],
validation_results: dict[str, Any],
dry_run: bool,
) -> None:
"""Store submission record in knowledge graph"""
submission_properties = {
"submission_id": submission_id,
"taxpayer_id": taxpayer_id,
"tax_year": tax_year,
"tenant_id": tenant_id,
"status": hmrc_response.get("status"),
"hmrc_reference": hmrc_response.get("reference"),
"submission_timestamp": hmrc_response.get("timestamp"),
"validation_results": json.dumps(validation_results),
"dry_run": dry_run,
"source": "hmrc_service",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
if not neo4j_client:
raise Exception("Neo4j client not initialized")
await neo4j_client.create_node("Submission", submission_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
async def _store_submission_error(
submission_id: str, error_message: str, tenant_id: str
) -> None:
"""Store submission error"""
error_properties = {
"submission_id": submission_id,
"tenant_id": tenant_id,
"status": "error",
"error_message": error_message,
"submission_timestamp": datetime.utcnow().isoformat(),
"source": "hmrc_service",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
if not neo4j_client:
raise Exception("Neo4j client not initialized")
await neo4j_client.create_node("Submission", error_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
async def _store_oauth_state(state: str, taxpayer_id: str, tenant_id: str) -> None:
"""Store OAuth state temporarily"""
# This would use Redis or similar for temporary storage
# For now, just log
logger.debug("OAuth state stored", state=state, taxpayer_id=taxpayer_id)
async def _get_oauth_state(state: str) -> dict[str, Any] | None:
"""Get OAuth state"""
# This would retrieve from Redis
# For now, return mock data
return {"taxpayer_id": "test_taxpayer", "tenant_id": "test_tenant"}
async def _delete_oauth_state(state: str) -> None:
"""Delete OAuth state"""
# This would delete from Redis
logger.debug("OAuth state deleted", state=state)
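# A minimal sketch of the Redis-backed state storage described by the three helpers
# above, assuming redis.asyncio is available and BaseAppSettings exposes redis_url
# (client setup, key prefix and TTL are assumptions):
#
# from redis.asyncio import Redis
#
# redis_client = Redis.from_url(settings.redis_url)
#
# async def _store_oauth_state(state: str, taxpayer_id: str, tenant_id: str) -> None:
#     await redis_client.setex(
#         f"hmrc:oauth:state:{state}",
#         600,  # match the 10 minute expiry returned by /oauth/authorize
#         json.dumps({"taxpayer_id": taxpayer_id, "tenant_id": tenant_id}),
#     )
#
# async def _get_oauth_state(state: str) -> dict[str, Any] | None:
#     raw = await redis_client.get(f"hmrc:oauth:state:{state}")
#     return json.loads(raw) if raw else None
#
# async def _delete_oauth_state(state: str) -> None:
#     await redis_client.delete(f"hmrc:oauth:state:{state}")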
async def _exchange_code_for_token(code: str) -> dict[str, Any]:
"""Exchange authorization code for access token"""
# This would call HMRC token endpoint
# For now, return mock token
return {
"access_token": "mock_access_token",
"refresh_token": "mock_refresh_token",
"expires_in": 3600,
"scope": "read:self-assessment write:self-assessment",
}
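# A hedged sketch of the real token exchange, assuming an httpx dependency and the
# standard OAuth2 authorization_code grant; the /oauth/token path should be confirmed
# against the HMRC developer documentation:
#
# import httpx
#
# base_url = settings.hmrc_sandbox_url if settings.use_sandbox else settings.hmrc_base_url
# async with httpx.AsyncClient(base_url=base_url, timeout=30) as client:
#     response = await client.post(
#         "/oauth/token",
#         data={
#             "grant_type": "authorization_code",
#             "code": code,
#             "client_id": settings.client_id,
#             "client_secret": settings.client_secret,
#             "redirect_uri": settings.redirect_uri,
#         },
#     )
#     response.raise_for_status()
#     return response.json()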
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8010, reload=True, log_config=None)

40
apps/svc_hmrc/requirements.txt Normal file
View File

@@ -0,0 +1,40 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# OAuth and authentication
authlib>=1.2.0
oauthlib>=3.2.0
# HTTP client with OAuth support
requests-oauthlib>=1.3.0
# XML processing for HMRC APIs
lxml>=4.9.0
xmltodict>=0.13.0
# JSON Web Tokens
pyjwt>=2.8.0
# GOV.UK Frontend Jinja templates
govuk-frontend-jinja>=2.8.0
# Date and time for tax years
python-dateutil>=2.8.0
# Retry mechanisms
tenacity>=8.2.0
# Rate limiting
ratelimit>=2.2.0
# API validation
marshmallow>=3.20.0
# Encryption for sensitive data
cryptography>=41.0.0
# Additional HTTP utilities
urllib3>=2.1.0

54
apps/svc_ingestion/Dockerfile Normal file
View File

@@ -0,0 +1,54 @@
# Multi-stage build for svc_ingestion
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
# Use base requirements (no ML dependencies for ingestion service)
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_ingestion/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_ingestion/ ./apps/svc_ingestion/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_ingestion.main:app", "--host", "0.0.0.0", "--port", "8000"]

10
apps/svc_ingestion/docker.env Normal file
View File

@@ -0,0 +1,10 @@
# FILE: apps/svc_ingestion/docker.env
VAULT_ADDR=http://vault:8200
VAULT_TOKEN=${VAULT_DEV_ROOT_TOKEN_ID:-root}
MINIO_ENDPOINT=minio:9092
POSTGRES_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@postgres:5432/tax_system
REDIS_URL=redis://redis:6379
EVENT_BUS_TYPE=${EVENT_BUS_TYPE:-memory}
NATS_SERVERS=${NATS_SERVERS:-nats://nats:4222}
NATS_STREAM_NAME=${NATS_STREAM_NAME:-TAX_AGENT_EVENTS}
NATS_CONSUMER_GROUP=${NATS_CONSUMER_GROUP:-tax-agent}

351
apps/svc_ingestion/main.py Normal file
View File

@@ -0,0 +1,351 @@
"""Document upload, storage, checksum validation, metadata extraction service."""
import hashlib
import mimetypes
import os
# Import shared libraries
import sys
from datetime import UTC, datetime
from typing import Any, cast
import structlog
import ulid
from fastapi import Depends, File, HTTPException, Request, UploadFile
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app, get_tenant_dependency, get_user_dependency
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer
from libs.schemas import DocumentKind, DocumentUploadResponse
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class IngestionSettings(BaseAppSettings):
"""Settings for ingestion service"""
service_name: str = "svc-ingestion"
# File upload limits
max_file_size: int = 50 * 1024 * 1024 # 50MB
allowed_mime_types: list[str] = [
"application/pdf",
"image/jpeg",
"image/png",
"image/tiff",
"text/csv",
"application/vnd.ms-excel",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]
# Storage configuration
raw_documents_bucket: str = "raw-documents"
evidence_bucket: str = "evidence"
# Global clients (initialized by init_dependencies at import time)
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
# Settings will be initialized after app creation
settings: IngestionSettings
def init_dependencies(app_settings: IngestionSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus, settings
settings = app_settings
logger.info(
"Starting ingestion service",
minio_endpoint=settings.minio_endpoint,
minio_access_key=settings.minio_access_key,
)
# Initialize clients
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
event_bus = create_event_bus(settings)
logger.info("Ingestion service started successfully")
# Create app and settings
app, _settings = create_app(
service_name="svc-ingestion",
title="Tax Agent Ingestion Service",
description="Document upload and storage service",
settings_class=IngestionSettings,
)
# Initialize dependencies immediately
init_dependencies(cast(IngestionSettings, _settings))
# Get observability components
tracer = get_tracer("svc-ingestion")
metrics = get_metrics("svc-ingestion")
# Health endpoints are provided by app_factory
@app.post("/upload", response_model=DocumentUploadResponse)
async def upload_document(
request: Request,
file: UploadFile = File(...),
kind: DocumentKind = DocumentKind.INVOICE,
source: str = "manual_upload",
current_user: dict[str, Any] = Depends(get_user_dependency()),
tenant_id: str = Depends(get_tenant_dependency()),
) -> DocumentUploadResponse:
"""Upload document for processing"""
# Check if services are initialized
if document_storage is None or event_bus is None:
raise HTTPException(
status_code=503, detail="Service not ready - dependencies not initialized"
)
with tracer.start_as_current_span("upload_document") as span:
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("document_kind", kind.value)
span.set_attribute("source", source)
try:
# Validate file
await _validate_upload(file)
# Generate document ID
doc_id = f"doc_{ulid.new()}"
span.set_attribute("doc_id", doc_id)
# Read file content
content = await file.read()
# Calculate checksum
checksum = hashlib.sha256(content).hexdigest()
# Detect MIME type
detected_mime = None
if file.filename:
detected_mime = mimetypes.guess_type(file.filename)[0]
content_type = (
detected_mime or file.content_type or "application/octet-stream"
)
# Store document
storage_result = await document_storage.store_document(
tenant_id=tenant_id,
doc_id=doc_id,
content=content,
content_type=content_type,
metadata={
"original_filename": file.filename or "unknown",
"kind": kind.value,
"source": source,
"uploaded_by": current_user.get("sub", "unknown"),
"uploaded_at": datetime.now(UTC).isoformat(),
},
)
# Publish event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"kind": kind.value,
"source": source,
"checksum": checksum,
"file_size": len(content),
"content_type": content_type,
"s3_url": storage_result["s3_url"],
},
actor=current_user.get("sub", "system"),
tenant_id=tenant_id,
trace_id=str(span.get_span_context().trace_id),
)
await event_bus.publish(EventTopics.DOC_INGESTED, event_payload)
# Update metrics
metrics.counter(
"documents_uploaded_total", labelnames=["tenant_id", "kind", "source"]
).labels(tenant_id=tenant_id, kind=kind.value, source=source).inc()
metrics.histogram(
"document_size_bytes", labelnames=["tenant_id", "kind"]
).labels(tenant_id=tenant_id, kind=kind.value).observe(len(content))
logger.info(
"Document uploaded successfully",
doc_id=doc_id,
tenant_id=tenant_id,
kind=kind.value,
size=len(content),
checksum=checksum,
)
return DocumentUploadResponse(
doc_id=doc_id, s3_url=storage_result["s3_url"], checksum=checksum
)
except ValueError as e:
logger.warning("Upload validation failed", error=str(e))
# Track validation errors
try:
metrics.counter(
"upload_errors_total", labelnames=["tenant_id", "error_type"]
).labels(tenant_id=tenant_id, error_type="ValueError").inc()
except Exception:
pass # Don't fail on metrics errors
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error("Upload failed", error=str(e))
# Track upload errors
try:
metrics.counter(
"upload_errors_total", labelnames=["tenant_id", "error_type"]
).labels(tenant_id=tenant_id, error_type=type(e).__name__).inc()
except Exception:
pass # Don't fail on metrics errors
raise HTTPException(status_code=500, detail="Upload failed")
@app.get("/documents/{doc_id}")
async def get_document_info(
doc_id: str,
current_user: dict[str, Any] = Depends(get_user_dependency()),
tenant_id: str = Depends(get_tenant_dependency()),
) -> dict[str, str]:
"""Get document information"""
# Check if services are initialized
if storage_client is None:
raise HTTPException(
status_code=503, detail="Service not ready - dependencies not initialized"
)
with tracer.start_as_current_span("get_document_info") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if document exists
ingestion_settings = cast(IngestionSettings, settings)
bucket_name = ingestion_settings.raw_documents_bucket
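            # NOTE: raw documents are looked up under a fixed ".pdf" key here,
            # regardless of the content type recorded at upload time.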
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
exists = await storage_client.object_exists(bucket_name, object_key)
if not exists:
raise HTTPException(status_code=404, detail="Document not found")
# Get presigned URL for download
download_url = await storage_client.get_presigned_url(
bucket_name=bucket_name, object_name=object_key, method="GET"
)
if not download_url:
raise HTTPException(
status_code=500, detail="Failed to generate download URL"
)
return {
"doc_id": doc_id,
"download_url": download_url,
"s3_url": f"s3://{bucket_name}/{object_key}",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get document info", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get document info")
@app.delete("/documents/{doc_id}")
async def delete_document(
doc_id: str,
current_user: dict[str, Any] = Depends(get_user_dependency()),
tenant_id: str = Depends(get_tenant_dependency()),
) -> dict[str, str]:
"""Delete document"""
# Check if services are initialized
if storage_client is None:
raise HTTPException(
status_code=503, detail="Service not ready - dependencies not initialized"
)
with tracer.start_as_current_span("delete_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Delete from storage
ingestion_settings = cast(IngestionSettings, settings)
bucket_name = ingestion_settings.raw_documents_bucket
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
success = await storage_client.delete_object(bucket_name, object_key)
if not success:
raise HTTPException(status_code=404, detail="Document not found")
logger.info("Document deleted", doc_id=doc_id, tenant_id=tenant_id)
return {"message": "Document deleted successfully"}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to delete document", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to delete document")
async def _validate_upload(file: UploadFile) -> None:
"""Validate uploaded file"""
# Cast settings to the correct type
ingestion_settings = cast(IngestionSettings, settings)
# Check file size
if file.size and file.size > ingestion_settings.max_file_size:
raise ValueError(
f"File too large: {file.size} bytes (max: {ingestion_settings.max_file_size})"
)
# Check MIME type
if file.content_type not in ingestion_settings.allowed_mime_types:
# Try to detect MIME type from filename
detected_mime = None
if file.filename:
detected_mime = mimetypes.guess_type(file.filename)[0]
if detected_mime not in ingestion_settings.allowed_mime_types:
raise ValueError(f"Unsupported file type: {file.content_type}")
# Check filename
if not file.filename:
raise ValueError("Filename is required")
# Check for malicious filenames
if ".." in file.filename or "/" in file.filename or "\\" in file.filename:
raise ValueError("Invalid filename")
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"main:app",
host="0.0.0.0",
port=8000,
reload=True,
log_config=None, # Use structlog configuration
)

9
apps/svc_ingestion/requirements.txt Normal file
View File

@@ -0,0 +1,9 @@
# Service-specific dependencies for svc_ingestion
# File upload and processing
aiofiles>=23.2.0
# MIME type detection
python-magic>=0.4.27
# Image processing (for thumbnails) - lightweight
Pillow>=10.1.0

54
apps/svc_kg/Dockerfile Normal file
View File

@@ -0,0 +1,54 @@
# Multi-stage build for svc_kg
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY libs/requirements-rdf.txt /tmp/libs-rdf.txt
COPY apps/svc_kg/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-rdf.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_kg/ ./apps/svc_kg/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_kg.main:app", "--host", "0.0.0.0", "--port", "8000"]

572
apps/svc_kg/main.py Normal file
View File

@@ -0,0 +1,572 @@
# FILE: apps/svc-kg/main.py
# Knowledge graph facade with CRUD, queries, lineage, and SHACL validation
import json
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
from fastapi import Depends, HTTPException, Query, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.events import EventBus
from libs.neo import Neo4jClient, SHACLValidator, TemporalQueries
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class KGSettings(BaseAppSettings):
"""Settings for KG service"""
service_name: str = "svc-kg"
# SHACL validation
shapes_file: str = "schemas/shapes.ttl"
validate_on_write: bool = True
# Query limits
max_results: int = 1000
max_depth: int = 10
query_timeout: int = 30
# Create app and settings
app, settings = create_app(
service_name="svc-kg",
title="Tax Agent Knowledge Graph Service",
description="Knowledge graph facade with CRUD and queries",
settings_class=KGSettings,
)
# Global clients
neo4j_client: Neo4jClient | None = None
shacl_validator: SHACLValidator | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-kg")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, shacl_validator, event_bus
logger.info("Starting KG service")
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize SHACL validator
if os.path.exists(settings.shapes_file):
shacl_validator = SHACLValidator(settings.shapes_file)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
logger.info("KG service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down KG service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("KG service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/nodes/{label}")
async def create_node(
label: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create a new node"""
with tracer.start_as_current_span("create_node") as span:
span.set_attribute("label", label)
span.set_attribute("tenant_id", tenant_id)
try:
# Add tenant isolation
properties["tenant_id"] = tenant_id
properties["created_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Create node
result = await neo4j_client.create_node(label, properties)
# Update metrics
metrics.counter("nodes_created_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node created", label=label, node_id=result.get("id"))
return {
"status": "created",
"label": label,
"properties": properties,
"neo4j_result": result,
}
except Exception as e:
logger.error("Failed to create node", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create node: {str(e)}"
)
@app.get("/nodes/{label}")
async def get_nodes(
label: str,
limit: int = Query(default=100, le=settings.max_results),
filters: str | None = Query(default=None),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get nodes by label with optional filters"""
with tracer.start_as_current_span("get_nodes") as span:
span.set_attribute("label", label)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("limit", limit)
try:
# Parse filters
filter_dict: dict[str, Any] = {}
if filters:
try:
filter_dict = json.loads(filters)
except json.JSONDecodeError:
raise HTTPException(status_code=400, detail="Invalid filters JSON")
# Add tenant isolation
filter_dict["tenant_id"] = tenant_id
# Build query
query = TemporalQueries.get_current_state_query(label, filter_dict)
query += f" LIMIT {limit}"
# Execute query
results = await neo4j_client.run_query(query)
# Update metrics
metrics.counter("nodes_queried_total").labels(
tenant_id=tenant_id, label=label
).inc()
return {
"label": label,
"count": len(results),
"nodes": [result["n"] for result in results],
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get nodes", label=label, error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to get nodes: {str(e)}"
)
@app.get("/nodes/{label}/{node_id}")
async def get_node(
label: str,
node_id: str,
include_lineage: bool = Query(default=False),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get specific node with optional lineage"""
with tracer.start_as_current_span("get_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get node
query = f"""
MATCH (n:{label} {{id: $node_id, tenant_id: $tenant_id}})
WHERE n.retracted_at IS NULL
RETURN n
"""
results = await neo4j_client.run_query(
query, {"node_id": node_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Node not found")
node_data = results[0]["n"]
# Get lineage if requested
lineage: list[dict[str, Any]] = []
if include_lineage:
lineage = await neo4j_client.get_node_lineage(node_id)
return {"node": node_data, "lineage": lineage if include_lineage else None}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(status_code=500, detail=f"Failed to get node: {str(e)}")
@app.put("/nodes/{label}/{node_id}")
async def update_node(
label: str,
node_id: str,
properties: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Update node with bitemporal versioning"""
with tracer.start_as_current_span("update_node") as span:
span.set_attribute("label", label)
span.set_attribute("node_id", node_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
properties["tenant_id"] = tenant_id
properties["updated_by"] = current_user.get("sub", "system")
# Validate with SHACL if enabled
if settings.validate_on_write and shacl_validator:
await _validate_node(label, properties)
# Update node (creates new version)
await neo4j_client.update_node(label, node_id, properties)
# Update metrics
metrics.counter("nodes_updated_total").labels(
tenant_id=tenant_id, label=label
).inc()
logger.info("Node updated", label=label, node_id=node_id)
return {
"status": "updated",
"label": label,
"node_id": node_id,
"properties": properties,
}
except Exception as e:
logger.error(
"Failed to update node", label=label, node_id=node_id, error=str(e)
)
raise HTTPException(
status_code=500, detail=f"Failed to update node: {str(e)}"
)
@app.post("/relationships")
async def create_relationship(
from_label: str,
from_id: str,
to_label: str,
to_id: str,
relationship_type: str,
properties: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create relationship between nodes"""
with tracer.start_as_current_span("create_relationship") as span:
span.set_attribute("from_label", from_label)
span.set_attribute("to_label", to_label)
span.set_attribute("relationship_type", relationship_type)
span.set_attribute("tenant_id", tenant_id)
try:
# Add metadata
rel_properties = properties or {}
rel_properties["tenant_id"] = tenant_id
rel_properties["created_by"] = current_user.get("sub", "system")
# Create relationship
await neo4j_client.create_relationship(
from_label, from_id, to_label, to_id, relationship_type, rel_properties
)
# Update metrics
metrics.counter("relationships_created_total").labels(
tenant_id=tenant_id, relationship_type=relationship_type
).inc()
logger.info(
"Relationship created",
from_id=from_id,
to_id=to_id,
type=relationship_type,
)
return {
"status": "created",
"from_id": from_id,
"to_id": to_id,
"relationship_type": relationship_type,
"properties": rel_properties,
}
except Exception as e:
logger.error("Failed to create relationship", error=str(e))
raise HTTPException(
status_code=500, detail=f"Failed to create relationship: {str(e)}"
)
@app.post("/query")
async def execute_query(
query: str,
parameters: dict[str, Any] | None = None,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Execute custom Cypher query with tenant isolation"""
with tracer.start_as_current_span("execute_query") as span:
span.set_attribute("tenant_id", tenant_id)
try:
# Add tenant isolation to parameters
query_params = parameters or {}
query_params["tenant_id"] = tenant_id
# Validate query (basic security check)
if not _is_safe_query(query):
raise HTTPException(status_code=400, detail="Unsafe query detected")
# Execute query with timeout
results = await neo4j_client.run_query(query, query_params, max_retries=1)
# Update metrics
metrics.counter("custom_queries_total").labels(tenant_id=tenant_id).inc()
return {
"query": query,
"parameters": query_params,
"results": results,
"count": len(results),
}
except Exception as e:
logger.error("Query execution failed", query=query[:100], error=str(e))
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
@app.get("/export/rdf")
async def export_rdf(
format: str = Query(default="turtle"),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Export knowledge graph as RDF"""
with tracer.start_as_current_span("export_rdf") as span:
span.set_attribute("format", format)
span.set_attribute("tenant_id", tenant_id)
try:
# Export tenant-specific data
rdf_data = await neo4j_client.export_to_rdf(format)
# Update metrics
metrics.counter("rdf_exports_total").labels(
tenant_id=tenant_id, format=format
).inc()
return {
"format": format,
"rdf_data": rdf_data,
"exported_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("RDF export failed", format=format, error=str(e))
raise HTTPException(
status_code=500, detail=f"RDF export failed: {str(e)}"
) from e
@app.post("/validate")
async def validate_graph(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Validate knowledge graph with SHACL"""
with tracer.start_as_current_span("validate_graph") as span:
span.set_attribute("tenant_id", tenant_id)
try:
if not shacl_validator:
raise HTTPException(
status_code=501, detail="SHACL validation not configured"
)
# Export current graph state
rdf_export = await neo4j_client.export_to_rdf("turtle")
# Extract RDF data from export result
rdf_data = rdf_export.get("rdf_data", "")
if not rdf_data:
raise HTTPException(
status_code=500, detail="Failed to export RDF data for validation"
)
# Run SHACL validation
validation_result = await shacl_validator.validate_graph(rdf_data)
# Update metrics
metrics.counter("validations_total").labels(
tenant_id=tenant_id, conforms=validation_result["conforms"]
).inc()
return {
"conforms": validation_result["conforms"],
"violations_count": validation_result["violations_count"],
"results_text": validation_result["results_text"],
"validated_at": datetime.utcnow().isoformat(),
}
except Exception as e:
logger.error("Graph validation failed", error=str(e))
raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
async def _validate_node(label: str, properties: dict[str, Any]) -> bool:
"""Validate node with SHACL"""
if not shacl_validator:
return True
try:
# Create a minimal RDF representation of the node for validation
rdf_lines = ["@prefix tax: <https://tax-kg.example.com/> ."]
node_uri = "tax:temp_node"
# Add type declaration
rdf_lines.append(f"{node_uri} a tax:{label} .")
# Add properties
for prop, value in properties.items():
if isinstance(value, str):
rdf_lines.append(f'{node_uri} tax:{prop} "{value}" .')
else:
rdf_lines.append(f"{node_uri} tax:{prop} {value} .")
rdf_data = "\n".join(rdf_lines)
# Validate the node RDF data
validation_result = await shacl_validator.validate_graph(rdf_data)
if not validation_result["conforms"]:
logger.warning(
"Node SHACL validation failed",
label=label,
violations=validation_result["violations_count"],
details=validation_result["results_text"],
)
return False
logger.debug("Node SHACL validation passed", label=label)
return True
except Exception as e:
logger.error("Node SHACL validation error", label=label, error=str(e))
# Return True to not block operations on validation errors
return True
def _is_safe_query(query: str) -> bool:
"""Basic query safety check"""
query_lower = query.lower()
# Block dangerous operations
dangerous_keywords = [
"delete",
"remove",
"drop",
"create index",
"create constraint",
"load csv",
"call",
"foreach",
]
for keyword in dangerous_keywords:
if keyword in query_lower:
return False
return True
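# Illustrative behaviour of _is_safe_query (shown as examples, not executed):
#   _is_safe_query("MATCH (n:IncomeItem) RETURN n")  -> True
#   _is_safe_query("MATCH (n) DETACH DELETE n")      -> False (contains "delete")
#   _is_safe_query("CALL db.labels()")               -> False (contains "call")
# The substring match is deliberately conservative and also rejects queries
# that merely contain one of the keywords inside another word (e.g. "recall").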
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
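# --- Illustrative client snippet (not part of the service API) --------------
# A minimal sketch of creating a node and reading it back via the endpoints
# above; the base URL, port, and bearer token are assumptions for local use.
async def example_create_and_query(token: str = "<token>") -> None:
    import httpx
    headers = {"Authorization": f"Bearer {token}"}
    async with httpx.AsyncClient(
        base_url="http://localhost:8005", headers=headers
    ) as client:
        # Create an IncomeItem node; tenant_id and created_by are added server-side.
        created = await client.post(
            "/nodes/IncomeItem",
            json={"id": "income_001", "amount": 1250.0, "currency": "GBP"},
        )
        created.raise_for_status()
        # Query nodes back; "filters" is a JSON-encoded property filter.
        listed = await client.get(
            "/nodes/IncomeItem",
            params={"limit": 10, "filters": json.dumps({"currency": "GBP"})},
        )
        listed.raise_for_status()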
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8005, reload=True, log_config=None)

22
apps/svc_kg/requirements.txt Normal file
View File

@@ -0,0 +1,22 @@
# Service-specific dependencies
# RDF and semantic web
rdflib>=7.0.0
pyshacl>=0.25.0
# Graph algorithms
networkx>=3.2.0
# Data export formats
xmltodict>=0.13.0
# Query optimization
pyparsing>=3.1.0
# Graph visualization (optional)
graphviz>=0.20.0
# Additional Neo4j utilities
neomodel>=5.2.0
# Cypher query building
py2neo>=2021.2.4

53
apps/svc_normalize_map/Dockerfile Normal file
View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_normalize_map
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]

590
apps/svc_normalize_map/main.py Normal file
View File

@@ -0,0 +1,590 @@
"""Data normalization and knowledge graph mapping."""
# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_minio_client,
create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class NormalizeMapSettings(BaseAppSettings):
"""Settings for normalize-map service"""
service_name: str = "svc-normalize-map"
# Normalization configuration
currency_default: str = "GBP"
date_formats: list[str] = [
"%Y-%m-%d",
"%d/%m/%Y",
"%d-%m-%Y",
"%d %B %Y",
"%d %b %Y",
"%B %d, %Y",
]
# Mapping configuration
confidence_threshold: float = 0.7
auto_create_entities: bool = True
# Validation rules
max_amount: float = 1000000.0 # £1M
min_confidence: float = 0.5
# Create app and settings
app, settings = create_app(
service_name="svc-normalize-map",
title="Tax Agent Normalize-Map Service",
description="Data normalization and knowledge graph mapping service",
settings_class=NormalizeMapSettings,
)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, neo4j_client, event_bus
logger.info("Starting normalize-map service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
# Subscribe to extraction completion events
await event_bus.subscribe( # type: ignore
EventTopics.DOC_EXTRACTED, _handle_extraction_completed
)
logger.info("Normalize-map service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, neo4j_client
logger.info("Shutting down normalize-map service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Normalize-map service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/normalize/{doc_id}")
async def normalize_document(
doc_id: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Normalize and map document data to knowledge graph"""
with tracer.start_as_current_span("normalize_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Check if extraction results exist
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if not extraction_results:
raise HTTPException(
status_code=404, detail="Extraction results not found"
)
# Generate normalization ID
normalization_id = str(ulid.new())
span.set_attribute("normalization_id", normalization_id)
# Start background normalization
background_tasks.add_task(
_normalize_and_map_async,
doc_id,
tenant_id,
extraction_results,
normalization_id,
current_user.get("sub", "system"),
)
logger.info(
"Normalization started",
doc_id=doc_id,
normalization_id=normalization_id,
)
return {
"normalization_id": normalization_id,
"doc_id": doc_id,
"status": "processing",
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start normalization")
async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
"""Handle extraction completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
confidence = data.get("confidence", 0.0)
if not doc_id or not tenant_id:
logger.warning("Invalid extraction completion event", data=data)
return
# Only auto-process if confidence is above threshold
if confidence >= settings.confidence_threshold:
logger.info(
"Auto-normalizing extracted document",
doc_id=doc_id,
confidence=confidence,
)
extraction_results = data.get("extraction_results")
if not extraction_results:
extraction_results = await document_storage.get_extraction_result(
tenant_id, doc_id
)
if extraction_results:
await _normalize_and_map_async(
doc_id=doc_id,
tenant_id=tenant_id,
extraction_results=extraction_results,
normalization_id=str(ulid.new()),
actor=payload.actor,
)
else:
logger.info(
"Skipping auto-normalization due to low confidence",
doc_id=doc_id,
confidence=confidence,
)
except Exception as e:
logger.error("Failed to handle extraction completion", error=str(e))
async def _normalize_and_map_async(
doc_id: str,
tenant_id: str,
extraction_results: dict[str, Any],
normalization_id: str,
actor: str,
) -> None:
"""Normalize and map data asynchronously"""
with tracer.start_as_current_span("normalize_and_map_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("normalization_id", normalization_id)
try:
extracted_fields = extraction_results.get("extracted_fields", {})
provenance = extraction_results.get("provenance", [])
# Normalize extracted data
normalized_data = await _normalize_data(extracted_fields, provenance)
# Map to knowledge graph entities
entities = await _map_to_entities(normalized_data, doc_id, tenant_id)
# Store entities in knowledge graph
stored_entities = await _store_entities(entities, tenant_id)
# Create normalization results
normalization_results = {
"doc_id": doc_id,
"normalization_id": normalization_id,
"normalized_at": datetime.utcnow().isoformat(),
"normalized_data": normalized_data,
"entities": stored_entities,
"entity_count": len(stored_entities),
}
logger.info("Normalization completed", results=normalization_results)
# Update metrics
metrics.counter("documents_normalized_total").labels(
tenant_id=tenant_id
).inc()
metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
len(stored_entities)
)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"normalization_id": normalization_id,
"entity_count": len(stored_entities),
"entities": stored_entities,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)
logger.info(
"Normalization completed", doc_id=doc_id, entities=len(stored_entities)
)
except Exception as e:
logger.error("Normalization failed", doc_id=doc_id, error=str(e))
# Update error metrics
metrics.counter("normalization_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
async def _normalize_data(
extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
"""Normalize extracted data"""
normalized = {}
for field_name, raw_value in extracted_fields.items():
try:
if "amount" in field_name.lower() or "total" in field_name.lower():
normalized[field_name] = _normalize_amount(raw_value)
elif "date" in field_name.lower():
normalized[field_name] = _normalize_date(raw_value)
elif "name" in field_name.lower():
normalized[field_name] = _normalize_name(raw_value)
elif "address" in field_name.lower():
normalized[field_name] = _normalize_address(raw_value)
elif "number" in field_name.lower():
normalized[field_name] = _normalize_number(raw_value)
else:
normalized[field_name] = _normalize_text(raw_value)
except Exception as e:
logger.warning(
"Failed to normalize field",
field=field_name,
value=raw_value,
error=str(e),
)
normalized[field_name] = raw_value # Keep original value
return normalized
def _normalize_amount(value: str) -> dict[str, Any]:
"""Normalize monetary amount"""
import re
if not value:
return {"amount": None, "currency": settings.currency_default}
# Remove currency symbols and formatting
clean_value = re.sub(r"[£$€,\s]", "", str(value))
try:
amount = Decimal(clean_value)
# Validate amount
if amount > settings.max_amount:
logger.warning("Amount exceeds maximum", amount=amount)
return {
"amount": float(amount),
"currency": settings.currency_default,
"original": value,
}
except Exception:
return {
"amount": None,
"currency": settings.currency_default,
"original": value,
}
def _normalize_date(value: str) -> dict[str, Any]:
"""Normalize date"""
from dateutil import parser
if not value:
return {"date": None, "original": value}
try:
# Try parsing with dateutil first
parsed_date = parser.parse(str(value), dayfirst=True)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
# Try manual formats
for fmt in settings.date_formats:
try:
parsed_date = datetime.strptime(str(value), fmt)
return {"date": parsed_date.date().isoformat(), "original": value}
except Exception:
continue
return {"date": None, "original": value}
def _normalize_name(value: str) -> dict[str, Any]:
"""Normalize person/company name"""
if not value:
return {"name": None, "original": value}
# Clean and title case
clean_name = str(value).strip().title()
# Detect if it's a company (contains Ltd, Limited, etc.)
company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
is_company = any(indicator in clean_name for indicator in company_indicators)
return {
"name": clean_name,
"type": "company" if is_company else "person",
"original": value,
}
def _normalize_address(value: str) -> dict[str, Any]:
"""Normalize address"""
import re
if not value:
return {"address": None, "original": value}
clean_address = str(value).strip()
# Extract UK postcode
postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
postcode = postcode_match.group().upper() if postcode_match else None
return {"address": clean_address, "postcode": postcode, "original": value}
def _normalize_number(value: str) -> dict[str, Any]:
"""Normalize reference numbers"""
import re
if not value:
return {"number": None, "original": value}
# Remove spaces and special characters
clean_number = re.sub(r"[^\w]", "", str(value))
# Detect number type
number_type = "unknown"
if len(clean_number) == 10 and clean_number.isdigit():
number_type = "utr" # UTR is 10 digits
elif len(clean_number) == 8 and clean_number.isdigit():
number_type = "account_number"
elif re.match(r"^\d{6}$", clean_number):
number_type = "sort_code"
return {"number": clean_number, "type": number_type, "original": value}
def _normalize_text(value: str) -> dict[str, Any]:
"""Normalize general text"""
if not value:
return {"text": None, "original": value}
clean_text = str(value).strip()
return {"text": clean_text, "original": value}
async def _map_to_entities(
normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
"""Map normalized data to knowledge graph entities"""
entities = []
# Create document entity
doc_entity = {
"type": "Document",
"id": doc_id,
"properties": {
"doc_id": doc_id,
"tenant_id": tenant_id,
"processed_at": datetime.utcnow().isoformat(),
"source": "extraction",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(doc_entity)
# Map specific field types to entities
for field_name, normalized_value in normalized_data.items():
if isinstance(normalized_value, dict):
if "amount" in normalized_value and normalized_value["amount"] is not None:
# Create expense or income item
entity_type = (
"ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
)
entity = {
"type": entity_type,
"id": f"{entity_type.lower()}_{ulid.new()}",
"properties": {
"amount": normalized_value["amount"],
"currency": normalized_value["currency"],
"description": field_name,
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
elif "name" in normalized_value and normalized_value["name"] is not None:
# Create party entity
entity = {
"type": "Party",
"id": f"party_{ulid.new()}",
"properties": {
"name": normalized_value["name"],
"party_type": normalized_value.get("type", "unknown"),
"source": doc_id,
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
},
}
entities.append(entity)
return entities
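# Illustrative mapping (entity ids are fresh ULIDs at runtime): a normalized
# field such as {"rental_income": {"amount": 950.0, "currency": "GBP", ...}}
# yields the Document entity above plus one IncomeItem, while
# {"landlord_name": {"name": "Acme Lettings Ltd", "type": "company", ...}}
# yields a Party entity with party_type "company".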
async def _store_entities(
entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
"""Store entities in knowledge graph"""
stored_entities = []
for entity in entities:
try:
# Create node in Neo4j
result = await neo4j_client.create_node(
label=entity["type"], properties=entity["properties"]
)
stored_entities.append(
{
"type": entity["type"],
"id": entity["id"],
"neo4j_id": result.get("id"),
"properties": entity["properties"],
}
)
logger.debug("Entity stored", type=entity["type"], id=entity["id"])
except Exception as e:
logger.error("Failed to store entity", entity=entity, error=str(e))
return stored_entities
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
        ).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8004, reload=True, log_config=None)

37
apps/svc_normalize_map/requirements.txt Normal file
View File

@@ -0,0 +1,37 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.1.0
numpy>=1.24.0
# Currency and exchange rates
forex-python>=1.8
babel>=2.13.0
# Date and time processing
python-dateutil>=2.8.0
pytz>=2023.3
# Text normalization
unidecode>=1.3.0
phonenumbers>=8.13.0
# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0
# Geographic data
geopy>=2.4.0
pycountry>=23.12.0
# Data validation
cerberus>=1.3.4
marshmallow>=3.20.0
# UK-specific utilities
uk-postcode-utils>=1.0.0

43
apps/svc_ocr/Dockerfile Normal file
View File

@@ -0,0 +1,43 @@
# Dockerfile for svc_ocr - Uses base-ml image
# Base image contains: FastAPI, database drivers, transformers, PyTorch, numpy, etc.
# This Dockerfile adds OCR-specific dependencies and application code
ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install system and service-specific dependencies
USER root
# Install OCR runtime dependencies (Tesseract, poppler)
RUN apt-get update && apt-get install -y \
tesseract-ocr \
tesseract-ocr-eng \
poppler-utils \
&& rm -rf /var/lib/apt/lists/*
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_ocr/ ./apps/svc_ocr/
# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_ocr.main:app", "--host", "0.0.0.0", "--port", "8000"]

504
apps/svc_ocr/main.py Normal file
View File

@@ -0,0 +1,504 @@
# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class OCRSettings(BaseAppSettings):
"""Settings for OCR service"""
service_name: str = "svc-ocr"
# OCR configuration
tesseract_cmd: str = "/usr/bin/tesseract"
tesseract_config: str = "--oem 3 --psm 6"
languages: str = "eng"
# Layout analysis
layoutlm_model: str = "microsoft/layoutlm-base-uncased"
confidence_threshold: float = 0.7
# Processing limits
max_pages: int = 50
max_file_size: int = 100 * 1024 * 1024 # 100MB
# Output configuration
include_coordinates: bool = True
include_confidence: bool = True
# Create app and settings
app, settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus
logger.info("Starting OCR service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
# Subscribe to document ingestion events
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
logger.info("OCR service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down OCR service")
if event_bus:
await event_bus.stop()
logger.info("OCR service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/process/{doc_id}")
async def process_document(
doc_id: str,
background_tasks: BackgroundTasks,
strategy: str = "hybrid",
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Process document with OCR"""
with tracer.start_as_current_span("process_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", strategy)
try:
# Check if document exists
doc_content = await document_storage.get_document(tenant_id, doc_id)
if not doc_content:
raise HTTPException(status_code=404, detail="Document not found")
# Generate processing ID
processing_id = str(ulid.new())
span.set_attribute("processing_id", processing_id)
# Start background processing
background_tasks.add_task(
_process_document_async,
doc_id,
tenant_id,
doc_content,
strategy,
processing_id,
current_user.get("sub", "system"),
)
logger.info(
"OCR processing started", doc_id=doc_id, processing_id=processing_id
)
return {
"processing_id": processing_id,
"doc_id": doc_id,
"status": "processing",
"strategy": strategy,
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start processing")
@app.get("/results/{doc_id}")
async def get_ocr_results(
doc_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get OCR results for document"""
with tracer.start_as_current_span("get_ocr_results") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get OCR results from storage
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
return ocr_results
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get OCR results")
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
"""Handle document ingestion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
if not doc_id or not tenant_id:
logger.warning("Invalid document ingestion event", data=data)
return
# Auto-process PDF documents
if data.get("content_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id)
# Get document content
doc_content = await document_storage.get_document(tenant_id, doc_id)
if doc_content:
await _process_document_async(
doc_id=doc_id,
tenant_id=tenant_id,
content=doc_content,
strategy="hybrid",
processing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle document ingestion", error=str(e))
async def _process_document_async(
doc_id: str,
tenant_id: str,
content: bytes,
strategy: str,
processing_id: str,
actor: str,
) -> None:
"""Process document asynchronously"""
with tracer.start_as_current_span("process_document_async") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("processing_id", processing_id)
span.set_attribute("strategy", strategy)
        try:
            start_time = datetime.utcnow()
            # Convert PDF to images
            images = await _pdf_to_images(content)
# Process each page
pages_data: list[Any] = []
for page_num, image in enumerate(images, 1):
page_data = await _process_page(image, page_num, strategy)
pages_data.append(page_data)
# Combine results
ocr_results = {
"doc_id": doc_id,
"processing_id": processing_id,
"strategy": strategy,
"processed_at": datetime.utcnow().isoformat(),
"total_pages": len(pages_data),
"pages": pages_data,
"metadata": {
"confidence_threshold": settings.confidence_threshold,
"languages": settings.languages,
},
}
# Store results
await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)
# Update metrics
metrics.counter("documents_processed_total").labels(
tenant_id=tenant_id, strategy=strategy
).inc()
metrics.histogram("processing_duration_seconds").labels(
strategy=strategy
).observe(
datetime.utcnow().timestamp()
- datetime.fromisoformat(
ocr_results["processed_at"].replace("Z", "")
).timestamp()
)
# Publish completion event
event_payload = EventPayload(
data={
"doc_id": doc_id,
"tenant_id": tenant_id,
"processing_id": processing_id,
"strategy": strategy,
"total_pages": len(pages_data),
"ocr_results": ocr_results,
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)
logger.info(
"OCR processing completed", doc_id=doc_id, pages=len(pages_data)
)
except Exception as e:
logger.error("OCR processing failed", doc_id=doc_id, error=str(e))
# Update error metrics
metrics.counter("processing_errors_total").labels(
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
).inc()
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
"""Convert PDF to images"""
try:
import fitz # PyMuPDF
# Open PDF
pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
images: list[Any] = []
for page_num in range(min(len(pdf_doc), settings.max_pages)):
page = pdf_doc[page_num]
# Render page to image
mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
images.append(img_data)
pdf_doc.close()
return images
except ImportError:
logger.error("PyMuPDF not available, using fallback")
return await _pdf_to_images_fallback(pdf_content)
except Exception as e:
logger.error("PDF conversion failed", error=str(e))
raise
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
"""Fallback PDF to images conversion"""
try:
from pdf2image import convert_from_bytes
images = convert_from_bytes(
pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
)
# Convert PIL images to bytes
image_bytes: list[Any] = []
for img in images:
import io
img_buffer = io.BytesIO()
img.save(img_buffer, format="PNG")
image_bytes.append(img_buffer.getvalue())
return image_bytes
except ImportError:
logger.error("pdf2image not available")
        raise RuntimeError("No PDF conversion library available")
async def _process_page(
image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
"""Process single page with OCR"""
if strategy == "tesseract":
return await _process_with_tesseract(image_data, page_num)
elif strategy == "layoutlm":
return await _process_with_layoutlm(image_data, page_num)
elif strategy == "hybrid":
# Combine both approaches
tesseract_result = await _process_with_tesseract(image_data, page_num)
layoutlm_result = await _process_with_layoutlm(image_data, page_num)
return {
"page": page_num,
"strategy": "hybrid",
"tesseract": tesseract_result,
"layoutlm": layoutlm_result,
"text": tesseract_result.get("text", ""),
"confidence": max(
tesseract_result.get("confidence", 0),
layoutlm_result.get("confidence", 0),
),
}
else:
raise ValueError(f"Unknown strategy: {strategy}")
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with Tesseract OCR"""
try:
import io
import pytesseract
from PIL import Image
# Load image
image = Image.open(io.BytesIO(image_data))
# Configure Tesseract
config = f"{settings.tesseract_config} -l {settings.languages}"
# Extract text with confidence
data = pytesseract.image_to_data(
image, config=config, output_type=pytesseract.Output.DICT
)
# Process results
words: list[Any] = []
confidences: list[Any] = []
for i in range(len(data["text"])):
if int(data["conf"][i]) > 0: # Valid confidence
word_data = {
"text": data["text"][i],
"confidence": int(data["conf"][i]) / 100.0,
"bbox": [
data["left"][i],
data["top"][i],
data["left"][i] + data["width"][i],
data["top"][i] + data["height"][i],
],
}
words.append(word_data)
confidences.append(word_data["confidence"])
# Extract full text
full_text = pytesseract.image_to_string(image, config=config)
return {
"page": page_num,
"strategy": "tesseract",
"text": full_text.strip(),
"words": words,
"confidence": sum(confidences) / len(confidences) if confidences else 0.0,
"word_count": len(words),
}
except ImportError:
logger.error("pytesseract not available")
return {
"page": page_num,
"strategy": "tesseract",
"error": "pytesseract not available",
}
except Exception as e:
logger.error("Tesseract processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "tesseract", "error": str(e)}
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with LayoutLM"""
try:
# This would integrate with LayoutLM model
# For now, return placeholder
logger.warning("LayoutLM processing not implemented")
return {
"page": page_num,
"strategy": "layoutlm",
"text": "",
"layout_elements": [],
"confidence": 0.0,
"error": "Not implemented",
}
except Exception as e:
logger.error("LayoutLM processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
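# --- Illustrative client snippet (not part of the service API) --------------
# A minimal sketch of triggering OCR and fetching results via the endpoints
# above; the base URL and bearer token are assumptions, and results only
# become available once the background task has stored them.
async def example_process(doc_id: str, token: str = "<token>") -> dict[str, Any]:
    import httpx
    headers = {"Authorization": f"Bearer {token}"}
    async with httpx.AsyncClient(
        base_url="http://localhost:8002", headers=headers
    ) as client:
        started = await client.post(f"/process/{doc_id}", params={"strategy": "hybrid"})
        started.raise_for_status()
        results = await client.get(f"/results/{doc_id}")
        return results.json() if results.status_code == 200 else {"status": "pending"}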
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)

16
apps/svc_ocr/requirements.txt Normal file
View File

@@ -0,0 +1,16 @@
# Service-specific dependencies for svc_ocr
# NOTE: ML dependencies (transformers, torch, numpy) are in base-ml image
# OCR engines (lightweight)
pytesseract>=0.3.13
# PDF processing
PyMuPDF>=1.26.4
pdf2image>=1.17.0
# Image processing
Pillow>=11.3.0
opencv-python-headless>=4.12.0.88 # Headless version is smaller
# Computer vision (torchvision not in base-ml)
torchvision>=0.23.0

36
apps/svc_rag_indexer/Dockerfile Normal file
View File

@@ -0,0 +1,36 @@
# Dockerfile for svc_rag_indexer - Uses base-ml image
# Base image contains: FastAPI, database drivers, sentence-transformers, PyTorch, numpy, etc.
# This Dockerfile only adds service-specific dependencies and application code
ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY apps/svc_rag_indexer/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_rag_indexer/ ./apps/svc_rag_indexer/
# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_rag_indexer.main:app", "--host", "0.0.0.0", "--port", "8000"]

535
apps/svc_rag_indexer/main.py Normal file
View File

@@ -0,0 +1,535 @@
# FILE: apps/svc-rag-indexer/main.py
# mypy: disable-error-code=union-attr
# Vector database indexing with PII protection and de-identification
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_qdrant_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.rag import PIIDetector, QdrantCollectionManager
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class RAGIndexerSettings(BaseAppSettings):
"""Settings for RAG indexer service"""
service_name: str = "svc-rag-indexer"
# Embedding configuration
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
embedding_dimension: int = 384
# Chunking configuration
chunk_size: int = 512
chunk_overlap: int = 50
# Collection configuration
collections: dict[str, str] = {
"documents": "Document chunks with metadata",
"tax_rules": "Tax rules and regulations",
"case_law": "Tax case law and precedents",
"guidance": "HMRC guidance and manuals",
}
# PII protection
require_pii_free: bool = True
auto_deidentify: bool = True
# Create app and settings
app, settings = create_app(
service_name="svc-rag-indexer",
title="Tax Agent RAG Indexer Service",
description="Vector database indexing with PII protection",
settings_class=RAGIndexerSettings,
)
# Global clients
qdrant_client = None
collection_manager: QdrantCollectionManager | None = None
pii_detector: PIIDetector | None = None
event_bus: EventBus | None = None
embedding_model = None
tracer = get_tracer("svc-rag-indexer")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global qdrant_client, collection_manager, pii_detector, event_bus, embedding_model
logger.info("Starting RAG indexer service")
# Setup observability
setup_observability(settings)
# Initialize Qdrant client
qdrant_client = create_qdrant_client(settings)
collection_manager = QdrantCollectionManager(qdrant_client)
# Initialize PII detector
pii_detector = PIIDetector()
# Initialize embedding model
try:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(settings.embedding_model)
logger.info("Embedding model loaded", model=settings.embedding_model)
except ImportError:
logger.warning("sentence-transformers not available, using mock embeddings")
embedding_model = None
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start()
# Subscribe to relevant events
await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted) # type: ignore
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
# Ensure collections exist
for collection_name in settings.collections:
await collection_manager.ensure_collection(
collection_name=collection_name, vector_size=settings.embedding_dimension
)
logger.info("RAG indexer service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down RAG indexer service")
if event_bus:
await event_bus.stop()
logger.info("RAG indexer service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"collections": list(settings.collections.keys()),
}
@app.post("/index/{collection_name}")
async def index_document(
collection_name: str,
document: dict[str, Any],
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
):
"""Index document in vector database"""
with tracer.start_as_current_span("index_document") as span:
span.set_attribute("collection_name", collection_name)
span.set_attribute("tenant_id", tenant_id)
try:
# Validate collection
if collection_name not in settings.collections:
raise HTTPException(
status_code=400, detail=f"Unknown collection: {collection_name}"
)
# Generate indexing ID
indexing_id = str(ulid.new())
span.set_attribute("indexing_id", indexing_id)
# Start background indexing
background_tasks.add_task(
_index_document_async,
collection_name,
document,
tenant_id,
indexing_id,
current_user.get("sub", "system"),
)
logger.info(
"Document indexing started",
collection=collection_name,
indexing_id=indexing_id,
)
return {
"indexing_id": indexing_id,
"collection": collection_name,
"status": "indexing",
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to start indexing", collection=collection_name, error=str(e)
)
raise HTTPException(status_code=500, detail="Failed to start indexing")
@app.get("/collections")
async def list_collections(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
):
"""List available collections"""
try:
collections_info: list[Any] = []
for collection_name, description in settings.collections.items():
# Get collection info from Qdrant
try:
collection_info = qdrant_client.get_collection(collection_name)
point_count = collection_info.points_count
vector_count = collection_info.vectors_count
except Exception:
point_count = 0
vector_count = 0
collections_info.append(
{
"name": collection_name,
"description": description,
"point_count": point_count,
"vector_count": vector_count,
}
)
return {
"collections": collections_info,
"total_collections": len(collections_info),
}
except Exception as e:
logger.error("Failed to list collections", error=str(e))
raise HTTPException(status_code=500, detail="Failed to list collections")
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
"""Handle document extraction completion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
extraction_results = data.get("extraction_results")
if not doc_id or not tenant_id or not extraction_results:
logger.warning("Invalid document extraction event", data=data)
return
logger.info("Auto-indexing extracted document", doc_id=doc_id)
# Create document for indexing
document = {
"doc_id": doc_id,
"content": _extract_content_from_results(extraction_results),
"metadata": {
"doc_id": doc_id,
"tenant_id": tenant_id,
"extraction_id": extraction_results.get("extraction_id"),
"confidence": extraction_results.get("confidence", 0.0),
"extracted_at": extraction_results.get("extracted_at"),
"source": "extraction",
},
}
await _index_document_async(
collection_name="documents",
document=document,
tenant_id=tenant_id,
indexing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle document extraction event", error=str(e))
async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
"""Handle knowledge graph upsert events"""
try:
data = payload.data
entities = data.get("entities", [])
tenant_id = data.get("tenant_id")
if not entities or not tenant_id:
logger.warning("Invalid KG upsert event", data=data)
return
logger.info("Auto-indexing KG entities", count=len(entities))
# Index entities as documents
for entity in entities:
document = {
"entity_id": entity.get("id"),
"content": _extract_content_from_entity(entity),
"metadata": {
"entity_type": entity.get("type"),
"entity_id": entity.get("id"),
"tenant_id": tenant_id,
"source": "knowledge_graph",
},
}
await _index_document_async(
collection_name="documents",
document=document,
tenant_id=tenant_id,
indexing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle KG upsert event", error=str(e))
async def _index_document_async(
collection_name: str,
document: dict[str, Any],
tenant_id: str,
indexing_id: str,
actor: str,
):
"""Index document asynchronously"""
with tracer.start_as_current_span("index_document_async") as span:
span.set_attribute("collection_name", collection_name)
span.set_attribute("indexing_id", indexing_id)
span.set_attribute("tenant_id", tenant_id)
try:
content = document.get("content", "")
metadata = document.get("metadata", {})
# Check for PII and de-identify if needed
if settings.require_pii_free:
has_pii = pii_detector.has_pii(content)
if has_pii:
if settings.auto_deidentify:
content, pii_mapping = pii_detector.de_identify_text(content)
metadata["pii_removed"] = True
metadata["pii_mapping_hash"] = _hash_pii_mapping(pii_mapping)
logger.info("PII removed from content", indexing_id=indexing_id)
else:
logger.warning(
"Content contains PII, skipping indexing",
indexing_id=indexing_id,
)
return
# Mark as PII-free
metadata["pii_free"] = True
metadata["tenant_id"] = tenant_id
metadata["indexed_at"] = datetime.utcnow().isoformat()
# Chunk content
chunks = _chunk_text(content)
# Generate embeddings and index chunks
indexed_chunks = 0
for i, chunk in enumerate(chunks):
try:
# Generate embedding
embedding = await _generate_embedding(chunk)
# Create point (Qdrant point IDs must be unsigned integers or UUIDs,
# so derive a deterministic UUID from the indexing ID and chunk index)
import uuid
from qdrant_client.models import PointStruct
point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{indexing_id}_{i}"))
point = PointStruct(
id=point_id,
vector=embedding,
payload={
**metadata,
"chunk_text": chunk,
"chunk_index": i,
"total_chunks": len(chunks),
},
)
# Index point
success = await collection_manager.upsert_points(
collection_name, [point]
)
if success:
indexed_chunks += 1
except Exception as e:
logger.error("Failed to index chunk", chunk_index=i, error=str(e))
# Update metrics
metrics.counter("documents_indexed_total").labels(
tenant_id=tenant_id, collection=collection_name
).inc()
metrics.histogram("chunks_per_document").labels(
collection=collection_name
).observe(indexed_chunks)
# Publish completion event
event_payload = EventPayload(
data={
"indexing_id": indexing_id,
"collection": collection_name,
"tenant_id": tenant_id,
"chunks_indexed": indexed_chunks,
"total_chunks": len(chunks),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.RAG_INDEXED, event_payload)
logger.info(
"Document indexing completed",
indexing_id=indexing_id,
chunks=indexed_chunks,
)
except Exception as e:
logger.error(
"Document indexing failed", indexing_id=indexing_id, error=str(e)
)
# Update error metrics
metrics.counter("indexing_errors_total").labels(
tenant_id=tenant_id,
collection=collection_name,
error_type=type(e).__name__,
).inc()
def _extract_content_from_results(extraction_results: dict[str, Any]) -> str:
"""Extract text content from extraction results"""
content_parts: list[Any] = []
# Add extracted fields
extracted_fields = extraction_results.get("extracted_fields", {})
for field_name, field_value in extracted_fields.items():
content_parts.append(f"{field_name}: {field_value}")
return "\n".join(content_parts)
def _extract_content_from_entity(entity: dict[str, Any]) -> str:
"""Extract text content from KG entity"""
content_parts: list[Any] = []
# Add entity type and ID
entity_type = entity.get("type", "Unknown")
entity_id = entity.get("id", "")
content_parts.append(f"Entity Type: {entity_type}")
content_parts.append(f"Entity ID: {entity_id}")
# Add properties
properties = entity.get("properties", {})
for prop_name, prop_value in properties.items():
if prop_name not in ["tenant_id", "asserted_at", "retracted_at"]:
content_parts.append(f"{prop_name}: {prop_value}")
return "\n".join(content_parts)
def _chunk_text(text: str) -> list[str]:
"""Chunk text into smaller pieces"""
if not text:
return []
# Simple chunking by sentences/paragraphs
chunks: list[Any] = []
current_chunk = ""
sentences = text.split(". ")
for sentence in sentences:
if len(current_chunk) + len(sentence) < settings.chunk_size:
current_chunk += sentence + ". "
else:
if current_chunk:
chunks.append(current_chunk.strip())
current_chunk = sentence + ". "
if current_chunk:
chunks.append(current_chunk.strip())
return chunks
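# Note: sentences are packed whole into chunks of up to roughly `chunk_size`
# characters; the `chunk_overlap` setting declared in RAGIndexerSettings above
# is not applied by this simple splitter. Illustrative behaviour (hypothetical
# input): a 1,200-character passage of ~60-character sentences comes back as
# two chunks of roughly 500 characters plus a shorter final chunk, each split
# on sentence boundaries.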
async def _generate_embedding(text: str) -> list[float]:
"""Generate embedding for text"""
if embedding_model:
try:
embedding = embedding_model.encode(text)
return embedding.tolist()
except Exception as e:
logger.error("Failed to generate embedding", error=str(e))
# Fallback: random embedding
import random
return [random.random() for _ in range(settings.embedding_dimension)]
def _hash_pii_mapping(pii_mapping: dict[str, str]) -> str:
"""Create hash of PII mapping for audit purposes"""
import hashlib
import json
mapping_json = json.dumps(pii_mapping, sort_keys=True)
return hashlib.sha256(mapping_json.encode()).hexdigest()
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8006, reload=True, log_config=None)

View File

@@ -0,0 +1,19 @@
# Service-specific dependencies for svc_rag_indexer
# NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
# Text chunking (lightweight alternative to langchain)
tiktoken>=0.11.0
# Text preprocessing (lightweight)
beautifulsoup4>=4.14.2
# Text similarity (CPU-only)
faiss-cpu>=1.12.0
# Document processing (lightweight)
python-docx>=1.2.0
python-pptx>=1.0.2
openpyxl>=3.1.5
# Sparse vector processing
sparse-dot-topn>=1.1.5

View File

@@ -0,0 +1,36 @@
# Dockerfile for svc_rag_retriever - Uses base-ml image
# Base image contains: FastAPI, database drivers, sentence-transformers, PyTorch, etc.
# This Dockerfile only adds service-specific dependencies and application code
ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
# Switch to root to install service-specific dependencies
USER root
# Set working directory
WORKDIR /app
# Copy service-specific requirements and install
COPY apps/svc_rag_retriever/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_rag_retriever/ ./apps/svc_rag_retriever/
# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_rag_retriever.main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@@ -0,0 +1,476 @@
# FILE: apps/svc_rag_retriever/main.py
# mypy: disable-error-code=union-attr
# Hybrid search with KG fusion, reranking, and calibrated confidence
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
from fastapi import Depends, HTTPException, Query, Request
from fastapi.responses import JSONResponse
from qdrant_client.models import SparseVector
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.calibration import ConfidenceCalibrator
from libs.config import (
BaseAppSettings,
create_event_bus,
create_neo4j_client,
create_qdrant_client,
)
from libs.events import EventBus
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.rag import RAGRetriever
from libs.schemas import ErrorResponse, RAGSearchRequest, RAGSearchResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class RAGRetrieverSettings(BaseAppSettings):
"""Settings for RAG retriever service"""
service_name: str = "svc-rag-retriever"
# Embedding configuration
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
embedding_dimension: int = 384
# Search configuration
default_k: int = 10
max_k: int = 100
alpha: float = 0.5 # Dense/sparse balance
beta: float = 0.3 # Vector/KG balance
gamma: float = 0.2 # Reranking weight
# Collections to search
search_collections: list[str] = ["documents", "tax_rules", "guidance"]
# Reranking
reranker_model: str | None = None
rerank_top_k: int = 50
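# How the weights above are expected to combine (assumption - the actual fusion
# lives in libs.rag.RAGRetriever, so treat this as an illustrative sketch):
#   vector_score = alpha * dense_score + (1 - alpha) * sparse_score
#   fused_score  = (1 - beta) * vector_score + beta * kg_score
# with gamma scaling the reranker's adjustment over the top rerank_top_k hits.
# Worked example with the defaults and hypothetical scores dense=0.8, sparse=0.4,
# kg=0.6: vector_score = 0.5*0.8 + 0.5*0.4 = 0.60; fused_score = 0.7*0.60 + 0.3*0.6 = 0.60.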
# Create app and settings
app, settings = create_app(
service_name="svc-rag-retriever",
title="Tax Agent RAG Retriever Service",
description="Hybrid search with KG fusion and reranking",
settings_class=RAGRetrieverSettings,
)
# Global clients
qdrant_client = None
neo4j_client: Neo4jClient | None = None
rag_retriever: RAGRetriever | None = None
event_bus: EventBus | None = None
embedding_model = None
confidence_calibrator: ConfidenceCalibrator | None = None
tracer = get_tracer("svc-rag-retriever")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global qdrant_client, neo4j_client, rag_retriever, event_bus, embedding_model, confidence_calibrator
logger.info("Starting RAG retriever service")
# Setup observability
setup_observability(settings)
# Initialize Qdrant client
qdrant_client = create_qdrant_client(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize RAG retriever
rag_retriever = RAGRetriever(
qdrant_client=qdrant_client,
neo4j_client=neo4j_client,
reranker_model=settings.reranker_model,
)
# Initialize embedding model
try:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(settings.embedding_model)
logger.info("Embedding model loaded", model=settings.embedding_model)
except ImportError:
logger.warning("sentence-transformers not available, using mock embeddings")
embedding_model = None
# Initialize confidence calibrator
confidence_calibrator = ConfidenceCalibrator(method="isotonic")
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info("RAG retriever service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down RAG retriever service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("RAG retriever service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"search_collections": settings.search_collections,
}
@app.post("/search", response_model=RAGSearchResponse)
async def search(
request_data: RAGSearchRequest,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> RAGSearchResponse:
"""Perform hybrid RAG search"""
with tracer.start_as_current_span("rag_search") as span:
span.set_attribute("query", request_data.query[:100])
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("k", request_data.k)
try:
# Generate embeddings for query
dense_vector = await _generate_embedding(request_data.query)
sparse_vector = await _generate_sparse_vector(request_data.query)
# Perform search
search_results = await rag_retriever.search( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
query=request_data.query,
collections=settings.search_collections,
dense_vector=dense_vector,
sparse_vector=sparse_vector,
k=request_data.k,
alpha=settings.alpha,
beta=settings.beta,
gamma=settings.gamma,
tax_year=request_data.tax_year,
jurisdiction=request_data.jurisdiction,
)
# Update metrics
metrics.counter("searches_total").labels(tenant_id=tenant_id).inc()
metrics.histogram("search_results_count").labels(
tenant_id=tenant_id
).observe(len(search_results["chunks"]))
metrics.histogram("search_confidence").labels(tenant_id=tenant_id).observe(
search_results["calibrated_confidence"]
)
logger.info(
"RAG search completed",
query=request_data.query[:50],
results=len(search_results["chunks"]),
confidence=search_results["calibrated_confidence"],
)
return RAGSearchResponse(
chunks=search_results["chunks"],
citations=search_results["citations"],
kg_hints=search_results["kg_hints"],
calibrated_confidence=search_results["calibrated_confidence"],
)
except Exception as e:
logger.error(
"RAG search failed", query=request_data.query[:50], error=str(e)
)
# Update error metrics
metrics.counter("search_errors_total").labels(
tenant_id=tenant_id, error_type=type(e).__name__
).inc()
raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
@app.get("/similar/{doc_id}")
async def find_similar_documents(
doc_id: str,
k: int = Query(default=10, le=settings.max_k),
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Find documents similar to given document"""
with tracer.start_as_current_span("find_similar") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("k", k)
try:
# Get document content from vector database
# This would search for the document by doc_id in metadata
from qdrant_client.models import FieldCondition, Filter, MatchValue
filter_conditions = Filter(
must=[
FieldCondition(key="doc_id", match=MatchValue(value=doc_id)),
FieldCondition(key="tenant_id", match=MatchValue(value=tenant_id)),
]
)
# Search for the document
doc_results = await rag_retriever.collection_manager.search_dense( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
collection_name="documents",
query_vector=[0.0] * settings.embedding_dimension, # Dummy vector
limit=1,
filter_conditions=filter_conditions,
)
if not doc_results:
raise HTTPException(status_code=404, detail="Document not found")
# Get the document's vector and use it for similarity search
doc_vector = doc_results[0]["payload"].get("vector")
if not doc_vector:
raise HTTPException(status_code=400, detail="Document has no vector")
# Find similar documents
similar_results = await rag_retriever.collection_manager.search_dense( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
collection_name="documents",
query_vector=doc_vector,
limit=k + 1, # +1 to exclude the original document
filter_conditions=Filter(
must=[
FieldCondition(
key="tenant_id", match=MatchValue(value=tenant_id)
)
],
must_not=[
FieldCondition(key="doc_id", match=MatchValue(value=doc_id))
],
),
)
return {
"doc_id": doc_id,
"similar_documents": similar_results[:k],
"count": len(similar_results[:k]),
}
except HTTPException:
raise
except Exception as e:
logger.error("Similar document search failed", doc_id=doc_id, error=str(e))
raise HTTPException(
status_code=500, detail=f"Similar search failed: {str(e)}"
)
@app.post("/explain")
async def explain_search(
query: str,
search_results: list[dict[str, Any]],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Explain search results and ranking"""
with tracer.start_as_current_span("explain_search") as span:
span.set_attribute("query", query[:100])
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("results_count", len(search_results))
try:
explanations = []
for i, result in enumerate(search_results):
explanation = {
"rank": i + 1,
"chunk_id": result.get("id"),
"score": result.get("score", 0.0),
"dense_score": result.get("dense_score", 0.0),
"sparse_score": result.get("sparse_score", 0.0),
"collection": result.get("collection"),
"explanation": _generate_explanation(query, result),
}
explanations.append(explanation)
return {
"query": query,
"explanations": explanations,
"ranking_factors": {
"alpha": settings.alpha,
"beta": settings.beta,
"gamma": settings.gamma,
},
}
except Exception as e:
logger.error("Search explanation failed", error=str(e))
raise HTTPException(status_code=500, detail=f"Explanation failed: {str(e)}")
async def _generate_embedding(text: str) -> list[float]:
"""Generate dense embedding for text"""
if embedding_model:
try:
embedding = embedding_model.encode(text)
return embedding.tolist()
except Exception as e:
logger.error("Failed to generate embedding", error=str(e))
# Fallback: random embedding
import random
return [random.random() for _ in range(settings.embedding_dimension)]
async def _generate_sparse_vector(text: str) -> SparseVector:
"""Generate sparse vector for text (BM25-style)"""
try:
# This would use a proper sparse encoder like SPLADE
# For now, create a simple sparse representation
from qdrant_client.models import SparseVector
# Simple word-based sparse vector
import hashlib
words = text.lower().split()
word_counts: dict[str, int] = {}
for word in words:
word_counts[word] = word_counts.get(word, 0) + 1
# Convert to sparse vector format
indices = []
values = []
for word, count in word_counts.items():
# Use a stable hash of the word as index so the same term always maps to the same dimension
word_hash = int(hashlib.md5(word.encode()).hexdigest(), 16) % 10000 # Limit vocabulary size
indices.append(word_hash)
values.append(float(count))
return SparseVector(indices=indices, values=values)
except Exception as e:
logger.error("Failed to generate sparse vector", error=str(e))
# Return empty sparse vector
from qdrant_client.models import SparseVector
return SparseVector(indices=[], values=[])
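# Illustrative shape of the output (hypothetical query): "capital gains tax"
# yields one dimension per distinct lower-cased word, e.g.
#   SparseVector(indices=[i_capital, i_gains, i_tax], values=[1.0, 1.0, 1.0])
# where each index is the word's hash modulo 10,000 and each value is the
# term's count within the query.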
def _generate_explanation(query: str, result: dict[str, Any]) -> str:
"""Generate human-readable explanation for search result"""
explanations = []
# Score explanation
score = result.get("score", 0.0)
dense_score = result.get("dense_score", 0.0)
sparse_score = result.get("sparse_score", 0.0)
explanations.append(f"Overall score: {score:.3f}")
if dense_score > 0:
explanations.append(f"Semantic similarity: {dense_score:.3f}")
if sparse_score > 0:
explanations.append(f"Keyword match: {sparse_score:.3f}")
# Collection explanation
collection = result.get("collection")
if collection:
explanations.append(f"Source: {collection}")
# Metadata explanation
payload = result.get("payload", {})
doc_id = payload.get("doc_id")
if doc_id:
explanations.append(f"Document: {doc_id}")
confidence = payload.get("confidence")
if confidence:
explanations.append(f"Extraction confidence: {confidence:.3f}")
return "; ".join(explanations)
@app.get("/stats")
async def get_search_stats(
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get search statistics"""
try:
# This would aggregate metrics from Prometheus
# For now, return mock stats
stats = {
"total_searches": 1000,
"avg_results_per_search": 8.5,
"avg_confidence": 0.75,
"collections": {
"documents": {"searches": 800, "avg_confidence": 0.78},
"tax_rules": {"searches": 150, "avg_confidence": 0.85},
"guidance": {"searches": 50, "avg_confidence": 0.70},
},
"top_queries": [
{"query": "capital gains tax", "count": 45},
{"query": "business expenses", "count": 38},
{"query": "property income", "count": 32},
],
}
return stats
except Exception as e:
logger.error("Failed to get search stats", error=str(e))
raise HTTPException(status_code=500, detail="Failed to get stats")
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8007, reload=True, log_config=None)

View File

@@ -0,0 +1,11 @@
# Service-specific dependencies for svc_rag_retriever
# NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
# Search and ranking (lightweight)
rank-bm25>=0.2.2
# Vector similarity (CPU-only, lighter than GPU version)
faiss-cpu>=1.12.0
# Sparse retrieval
sparse-dot-topn>=1.1.5

View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_reason
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_reason/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_reason/ ./apps/svc_reason/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_reason.main:app", "--host", "0.0.0.0", "--port", "8000"]

677
apps/svc_reason/main.py Normal file
View File

@@ -0,0 +1,677 @@
"""Tax calculation engine with schedule computation and evidence trails."""
# mypy: disable-error-code=union-attr
# FILE: apps/svc_reason/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
import os
# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse, ScheduleComputeRequest, ScheduleComputeResponse
from libs.security import get_current_user, get_tenant_id
logger = structlog.get_logger()
class ReasonSettings(BaseAppSettings):
"""Settings for reasoning service"""
service_name: str = "svc-reason"
# Tax year configuration
current_tax_year: str = "2023-24"
supported_tax_years: list[str] = ["2021-22", "2022-23", "2023-24", "2024-25"]
# Calculation configuration
precision: int = 2 # Decimal places
rounding_method: str = "ROUND_HALF_UP"
# Schedule support
supported_schedules: list[str] = ["SA100", "SA103", "SA105", "SA106"]
# Validation
max_income: float = 10000000.0 # £10M
max_expenses: float = 10000000.0 # £10M
# Create app and settings
app, settings = create_app(
service_name="svc-reason",
title="Tax Agent Reasoning Service",
description="Tax calculation engine with schedule computation",
settings_class=ReasonSettings,
)
# Global clients
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-reason")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global neo4j_client, event_bus
logger.info("Starting reasoning service")
# Setup observability
setup_observability(settings)
# Initialize Neo4j client
neo4j_driver = create_neo4j_client(settings)
neo4j_client = Neo4jClient(neo4j_driver)
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Subscribe to KG upsert events
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
logger.info("Reasoning service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global neo4j_client, event_bus
logger.info("Shutting down reasoning service")
if neo4j_client:
await neo4j_client.close()
if event_bus:
await event_bus.stop()
logger.info("Reasoning service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"supported_schedules": settings.supported_schedules,
}
@app.post("/compute", response_model=ScheduleComputeResponse)
async def compute_schedule(
request_data: ScheduleComputeRequest,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> ScheduleComputeResponse:
"""Compute tax schedule"""
with tracer.start_as_current_span("compute_schedule") as span:
span.set_attribute("tax_year", request_data.tax_year)
span.set_attribute("taxpayer_id", request_data.taxpayer_id)
span.set_attribute("schedule_id", request_data.schedule_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Validate inputs
if request_data.tax_year not in settings.supported_tax_years:
raise HTTPException(
status_code=400,
detail=f"Unsupported tax year: {request_data.tax_year}",
)
if request_data.schedule_id not in settings.supported_schedules:
raise HTTPException(
status_code=400,
detail=f"Unsupported schedule: {request_data.schedule_id}",
)
# Generate calculation ID
calculation_id = str(ulid.new())
span.set_attribute("calculation_id", calculation_id)
# Start background computation
background_tasks.add_task(
_compute_schedule_async,
request_data.tax_year,
request_data.taxpayer_id,
request_data.schedule_id,
tenant_id,
calculation_id,
current_user.get("sub", "system"),
)
logger.info(
"Schedule computation started",
calculation_id=calculation_id,
schedule=request_data.schedule_id,
)
return ScheduleComputeResponse(
calculation_id=calculation_id,
schedule=request_data.schedule_id,
form_boxes={}, # Will be populated when computation completes
evidence_trail=[],
)
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start computation", error=str(e))
raise HTTPException(status_code=500, detail="Failed to start computation")
@app.get("/calculations/{calculation_id}")
async def get_calculation_results(
calculation_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get calculation results"""
with tracer.start_as_current_span("get_calculation_results") as span:
span.set_attribute("calculation_id", calculation_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Query calculation from Neo4j
query = """
MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})
WHERE c.retracted_at IS NULL
RETURN c
"""
results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
query, {"calculation_id": calculation_id, "tenant_id": tenant_id}
)
if not results:
raise HTTPException(status_code=404, detail="Calculation not found")
calculation = results[0]["c"]
# Get form boxes
form_boxes_query = """
MATCH (c:Calculation {calculation_id: $calculation_id})-[:HAS_BOX]->(b:FormBox)
WHERE c.retracted_at IS NULL AND b.retracted_at IS NULL
RETURN b
"""
box_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
form_boxes_query, {"calculation_id": calculation_id}
)
form_boxes = {}
for box_result in box_results:
box = box_result["b"]
form_boxes[box["box"]] = {
"value": box["value"],
"description": box.get("description"),
"confidence": box.get("confidence"),
}
return {
"calculation_id": calculation_id,
"schedule": calculation.get("schedule"),
"tax_year": calculation.get("tax_year"),
"status": calculation.get("status", "completed"),
"form_boxes": form_boxes,
"calculated_at": calculation.get("calculated_at"),
}
except HTTPException:
raise
except Exception as e:
logger.error(
"Failed to get calculation results",
calculation_id=calculation_id,
error=str(e),
)
raise HTTPException(
status_code=500, detail="Failed to get calculation results"
)
async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
"""Handle KG upsert events for auto-calculation"""
try:
data = payload.data
entities = data.get("entities", [])
tenant_id = data.get("tenant_id")
# Check if we have enough data for calculation
has_income = any(e.get("type") == "IncomeItem" for e in entities)
has_expenses = any(e.get("type") == "ExpenseItem" for e in entities)
if has_income or has_expenses:
logger.info(
"Auto-triggering calculation due to new financial data",
tenant_id=tenant_id,
)
# Find taxpayer ID from entities
taxpayer_id = None
for entity in entities:
if entity.get("type") == "TaxpayerProfile":
taxpayer_id = entity.get("id")
break
if taxpayer_id:
await _compute_schedule_async(
tax_year=settings.current_tax_year,
taxpayer_id=taxpayer_id,
schedule_id="SA103", # Default to self-employment
tenant_id=tenant_id or "",
calculation_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle KG upsert for auto-calculation", error=str(e))
async def _compute_schedule_async(
tax_year: str,
taxpayer_id: str,
schedule_id: str,
tenant_id: str,
calculation_id: str,
actor: str,
) -> None:
"""Compute schedule asynchronously"""
with tracer.start_as_current_span("compute_schedule_async") as span:
span.set_attribute("calculation_id", calculation_id)
span.set_attribute("schedule_id", schedule_id)
span.set_attribute("tax_year", tax_year)
try:
# Get relevant data from knowledge graph
financial_data = await _get_financial_data(taxpayer_id, tax_year, tenant_id)
# Perform calculations based on schedule
if schedule_id == "SA103":
form_boxes, evidence_trail = await _compute_sa103(
financial_data, tax_year
)
elif schedule_id == "SA105":
form_boxes, evidence_trail = await _compute_sa105(
financial_data, tax_year
)
elif schedule_id == "SA100":
form_boxes, evidence_trail = await _compute_sa100(
financial_data, tax_year
)
else:
raise ValueError(f"Unsupported schedule: {schedule_id}")
# Store calculation in knowledge graph
await _store_calculation(
calculation_id,
schedule_id,
tax_year,
taxpayer_id,
form_boxes,
evidence_trail,
tenant_id,
)
# Update metrics
metrics.counter("calculations_completed_total").labels(
tenant_id=tenant_id, schedule=schedule_id, tax_year=tax_year
).inc()
# Publish completion event
event_payload = EventPayload(
data={
"calculation_id": calculation_id,
"schedule": schedule_id,
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
"tenant_id": tenant_id,
"form_boxes": form_boxes,
"box_count": len(form_boxes),
},
actor=actor,
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.CALC_SCHEDULE_READY, event_payload) # type: ignore
logger.info(
"Schedule computation completed",
calculation_id=calculation_id,
schedule=schedule_id,
boxes=len(form_boxes),
)
except Exception as e:
logger.error(
"Schedule computation failed",
calculation_id=calculation_id,
error=str(e),
)
# Update error metrics
metrics.counter("calculation_errors_total").labels(
tenant_id=tenant_id, schedule=schedule_id, error_type=type(e).__name__
).inc()
async def _get_financial_data(
taxpayer_id: str, tax_year: str, tenant_id: str
) -> dict[str, Any]:
"""Get financial data from knowledge graph"""
# Get income items
income_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_INCOME]->(i:IncomeItem)
WHERE i.retracted_at IS NULL
AND i.tax_year = $tax_year
RETURN i
"""
income_results = (
await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
income_query,
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
)
)
# Get expense items
expense_query = """
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_EXPENSE]->(e:ExpenseItem)
WHERE e.retracted_at IS NULL
AND e.tax_year = $tax_year
RETURN e
"""
expense_results = (
await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
expense_query,
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
)
)
return {
"income_items": [result["i"] for result in income_results],
"expense_items": [result["e"] for result in expense_results],
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
}
async def _compute_sa103(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA103 (Self-employment) schedule"""
income_items = financial_data.get("income_items", [])
expense_items = financial_data.get("expense_items", [])
# Calculate totals
total_turnover = Decimal("0")
total_expenses = Decimal("0")
evidence_trail = []
# Sum income
for income in income_items:
if income.get("type") == "self_employment":
amount = Decimal(str(income.get("gross", 0)))
total_turnover += amount
evidence_trail.append(
{
"box": "20",
"source_entity": income.get("income_id"),
"amount": float(amount),
"description": f"Income: {income.get('description', 'Unknown')}",
}
)
# Sum expenses
for expense in expense_items:
if expense.get("allowable", True):
amount = Decimal(str(expense.get("amount", 0)))
total_expenses += amount
evidence_trail.append(
{
"box": "31",
"source_entity": expense.get("expense_id"),
"amount": float(amount),
"description": f"Expense: {expense.get('description', 'Unknown')}",
}
)
# Calculate net profit
net_profit = total_turnover - total_expenses
# Create form boxes
form_boxes = {
"20": {
"value": float(total_turnover),
"description": "Total turnover",
"confidence": 0.9,
},
"31": {
"value": float(total_expenses),
"description": "Total allowable business expenses",
"confidence": 0.9,
},
"32": {
"value": float(net_profit),
"description": "Net profit",
"confidence": 0.9,
},
}
return form_boxes, evidence_trail
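# Worked example (hypothetical figures): two self-employment income items of
# £30,000 and £20,000 plus £12,000 of allowable expenses produce
#   box 20 (total turnover) = 50000.0
#   box 31 (total allowable expenses) = 12000.0
#   box 32 (net profit) = 38000.0
# with one evidence-trail entry recorded per contributing item.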
async def _compute_sa105(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA105 (Property income) schedule"""
income_items = financial_data.get("income_items", [])
expense_items = financial_data.get("expense_items", [])
# Calculate property income and expenses
total_rents = Decimal("0")
total_property_expenses = Decimal("0")
evidence_trail = []
# Sum property income
for income in income_items:
if income.get("type") == "property":
amount = Decimal(str(income.get("gross", 0)))
total_rents += amount
evidence_trail.append(
{
"box": "20",
"source_entity": income.get("income_id"),
"amount": float(amount),
"description": f"Property income: {income.get('description', 'Unknown')}",
}
)
# Sum property expenses
for expense in expense_items:
if expense.get("type") == "property" and expense.get("allowable", True):
amount = Decimal(str(expense.get("amount", 0)))
total_property_expenses += amount
# Map to appropriate SA105 box based on expense category
box = _map_property_expense_to_box(expense.get("category", "other"))
evidence_trail.append(
{
"box": box,
"source_entity": expense.get("expense_id"),
"amount": float(amount),
"description": f"Property expense: {expense.get('description', 'Unknown')}",
}
)
# Calculate net property income
net_property_income = total_rents - total_property_expenses
form_boxes = {
"20": {
"value": float(total_rents),
"description": "Total rents and other income",
"confidence": 0.9,
},
"38": {
"value": float(total_property_expenses),
"description": "Total property expenses",
"confidence": 0.9,
},
"net_income": {
"value": float(net_property_income),
"description": "Net property income",
"confidence": 0.9,
},
}
return form_boxes, evidence_trail
async def _compute_sa100(
financial_data: dict[str, Any], tax_year: str
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
"""Compute SA100 (Main return) schedule"""
# This would aggregate from other schedules
# For now, return basic structure
form_boxes = {
"1": {"value": "John Doe", "description": "Your name", "confidence": 0.9}
}
evidence_trail: list[dict[str, Any]] = []
return form_boxes, evidence_trail
def _map_property_expense_to_box(category: str) -> str:
"""Map property expense category to SA105 box"""
mapping = {
"rent_rates_insurance": "31",
"property_management": "32",
"services_wages": "33",
"repairs_maintenance": "34",
"finance_costs": "35",
"professional_fees": "36",
"costs_of_services": "37",
"other": "38",
}
return mapping.get(category, "38")
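# Example: _map_property_expense_to_box("repairs_maintenance") returns "34";
# any unrecognised category (e.g. a hypothetical "ground_rent") falls back to box "38".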
async def _store_calculation(
calculation_id: str,
schedule: str,
tax_year: str,
taxpayer_id: str,
form_boxes: dict[str, Any],
evidence_trail: list[dict[str, Any]],
tenant_id: str,
) -> None:
"""Store calculation results in knowledge graph"""
# Create calculation node
calc_properties = {
"calculation_id": calculation_id,
"schedule": schedule,
"tax_year": tax_year,
"taxpayer_id": taxpayer_id,
"tenant_id": tenant_id,
"calculated_at": datetime.utcnow().isoformat(),
"status": "completed",
"source": "reasoning_engine",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("Calculation", calc_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Create form box nodes
for box_id, box_data in form_boxes.items():
box_properties = {
"form": schedule,
"box": box_id,
"value": box_data["value"],
"description": box_data.get("description"),
"confidence": box_data.get("confidence"),
"calculation_id": calculation_id,
"tenant_id": tenant_id,
"source": "reasoning_engine",
"extractor_version": "1.0.0",
"valid_from": datetime.utcnow(),
"asserted_at": datetime.utcnow(),
}
await neo4j_client.create_node("FormBox", box_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Create relationship
await neo4j_client.create_relationship( # pyright: ignore[reportOptionalMemberAccess]
"Calculation",
calculation_id,
"FormBox",
f"{calculation_id}_{box_id}",
"HAS_BOX",
)
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id=getattr(request.state, "trace_id", None),
).model_dump(),
)
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8008, reload=True, log_config=None)

View File

@@ -0,0 +1,35 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Mathematical calculations
# decimal is part of Python standard library
sympy>=1.12.0
# Tax calculations
numpy>=2.3.3
pandas>=2.1.0
# Date and time calculations
python-dateutil>=2.8.0
pytz>=2023.3
# UK tax specific
# uk-tax-calculator>=1.0.0 # Package may not exist, commenting out
# Business rules engine
# python-rules>=1.3.0 # Package may not exist, commenting out
# Financial calculations
# quantlib>=1.32.0 # Package may not exist, commenting out
# Data validation
cerberus>=1.3.4
# Template processing for explanations
jinja2>=3.1.0
# Statistical calculations
scipy>=1.11.0

53
apps/svc_rpa/Dockerfile Normal file
View File

@@ -0,0 +1,53 @@
# Multi-stage build for svc_rpa
FROM python:3.12-slim AS builder
# Install build dependencies
RUN apt-get update && apt-get install -y \
build-essential \
curl \
&& rm -rf /var/lib/apt/lists/*
# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_rpa/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
# Production stage
FROM python:3.12-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
curl \
&& rm -rf /var/lib/apt/lists/* \
&& groupadd -r appuser \
&& useradd -r -g appuser appuser
# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"
# Set working directory
WORKDIR /app
# Copy application code
COPY libs/ ./libs/
COPY apps/svc_rpa/ ./apps/svc_rpa/
# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD curl -f http://localhost:8000/healthz || exit 1
# Expose port
EXPOSE 8000
# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_rpa.main:app", "--host", "0.0.0.0", "--port", "8000"]

524
apps/svc_rpa/main.py Normal file
View File

@@ -0,0 +1,524 @@
# FILE: apps/svc_rpa/main.py
# mypy: disable-error-code=union-attr
# Playwright automation for portal data extraction (HMRC, banks, etc.)
import asyncio
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from playwright.async_api import Browser, Page, async_playwright
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_vault_client
from libs.events import EventBus, EventPayload
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
logger = structlog.get_logger()
class RPASettings(BaseAppSettings):
"""Settings for RPA service"""
service_name: str = "svc-rpa"
# Browser configuration
browser_type: str = "chromium" # chromium, firefox, webkit
headless: bool = True
timeout: int = 30000 # 30 seconds
# Portal configurations
hmrc_base_url: str = "https://www.gov.uk/log-in-hmrc-online-services"
open_banking_enabled: bool = False
# Security
max_concurrent_sessions: int = 5
session_timeout: int = 300 # 5 minutes
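# Intended session lifecycle (as implemented by the endpoints below):
#   POST   /sessions                  -> create a browser context, returns session_id
#   POST   /sessions/{id}/navigate    -> drive the page to a portal URL
#   POST   /sessions/{id}/login       -> decrypt Vault-wrapped credentials and sign in
#   POST   /sessions/{id}/extract     -> scrape data and publish rpa.data_extracted
#   DELETE /sessions/{id}             -> close the context; idle sessions are also
#                                        reaped after session_timeout seconds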
# Create app and settings
app, settings = create_app(
service_name="svc-rpa",
title="Tax Agent RPA Service",
description="Robotic Process Automation for portal data extraction",
settings_class=RPASettings,
)
# Global clients
vault_helper: VaultTransitHelper | None = None
event_bus: EventBus | None = None
browser: Browser | None = None
active_sessions: dict[str, dict[str, Any]] = {}
tracer = get_tracer("svc-rpa")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global vault_helper, event_bus, browser
logger.info("Starting RPA service")
# Setup observability
setup_observability(settings)
# Initialize Vault helper
vault_client = create_vault_client(settings)
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
# Initialize event bus
event_bus = create_event_bus(settings)
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
# Initialize browser
playwright = await async_playwright().start()
browser = await playwright[settings.browser_type].launch(
headless=settings.headless,
args=["--no-sandbox", "--disable-dev-shm-usage"] if settings.headless else [],
)
logger.info("RPA service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus, browser
logger.info("Shutting down RPA service")
if browser:
await browser.close()
if event_bus:
await event_bus.stop()
logger.info("RPA service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
"active_sessions": len(active_sessions),
}
@app.post("/sessions")
async def create_session(
portal: str,
background_tasks: BackgroundTasks,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Create new RPA session"""
with tracer.start_as_current_span("create_session") as span:
span.set_attribute("portal", portal)
span.set_attribute("tenant_id", tenant_id)
try:
# Check session limits
if len(active_sessions) >= settings.max_concurrent_sessions:
raise HTTPException(status_code=429, detail="Too many active sessions")
# Generate session ID
session_id = str(ulid.new())
span.set_attribute("session_id", session_id)
# Create browser context
context = await browser.new_context( # pyright: ignore[reportOptionalMemberAccess]
viewport={"width": 1920, "height": 1080},
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
)
page = await context.new_page()
# Store session
active_sessions[session_id] = {
"context": context,
"page": page,
"portal": portal,
"tenant_id": tenant_id,
"user_id": current_user.get("sub"),
"created_at": datetime.utcnow(),
"last_activity": datetime.utcnow(),
}
# Schedule session cleanup
background_tasks.add_task(
_cleanup_session_after_timeout, session_id, settings.session_timeout
)
logger.info("RPA session created", session_id=session_id, portal=portal)
return {
"session_id": session_id,
"portal": portal,
"status": "created",
"expires_at": (
datetime.utcnow().timestamp() + settings.session_timeout
),
}
except Exception as e:
logger.error("Failed to create session", error=str(e))
raise HTTPException(status_code=500, detail="Failed to create session")
@app.post("/sessions/{session_id}/navigate")
async def navigate_to_url(
session_id: str,
url: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Navigate to URL in session"""
with tracer.start_as_current_span("navigate") as span:
span.set_attribute("session_id", session_id)
span.set_attribute("url", url)
try:
session = _get_session(session_id, tenant_id)
page = session["page"]
# Navigate to URL
response = await page.goto(url, timeout=settings.timeout)
# Update last activity
session["last_activity"] = datetime.utcnow()
# Take screenshot for debugging
await page.screenshot()
logger.info(
"Navigated to URL",
session_id=session_id,
url=url,
status=response.status,
)
return {
"status": "success",
"url": page.url,
"title": await page.title(),
"response_status": response.status,
}
except Exception as e:
logger.error(
"Navigation failed", session_id=session_id, url=url, error=str(e)
)
raise HTTPException(status_code=500, detail=f"Navigation failed: {str(e)}")
@app.post("/sessions/{session_id}/login")
async def login_to_portal(
session_id: str,
credentials: dict[str, str],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Login to portal using encrypted credentials"""
with tracer.start_as_current_span("login") as span:
span.set_attribute("session_id", session_id)
try:
session = _get_session(session_id, tenant_id)
page = session["page"]
portal = session["portal"]
# Decrypt credentials
decrypted_credentials: dict[str, Any] = {}
for key, encrypted_value in credentials.items():
decrypted_credentials[key] = (
vault_helper.decrypt_field( # pyright: ignore[reportOptionalMemberAccess]
key_name=key, ciphertext=encrypted_value
)
)
# Perform login based on portal type
if portal == "hmrc":
success = await _login_hmrc(page, decrypted_credentials)
elif portal == "open_banking":
success = await _login_open_banking(page, decrypted_credentials)
else:
raise ValueError(f"Unsupported portal: {portal}")
# Update session
session["last_activity"] = datetime.utcnow()
session["authenticated"] = success
if success:
logger.info("Login successful", session_id=session_id, portal=portal)
return {"status": "success", "authenticated": True}
else:
logger.warning("Login failed", session_id=session_id, portal=portal)
return {"status": "failed", "authenticated": False}
except Exception as e:
logger.error("Login error", session_id=session_id, error=str(e))
raise HTTPException(status_code=500, detail=f"Login failed: {str(e)}")
@app.post("/sessions/{session_id}/extract")
async def extract_data(
session_id: str,
extraction_config: dict[str, Any],
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Extract data from portal"""
with tracer.start_as_current_span("extract_data") as span:
span.set_attribute("session_id", session_id)
try:
session = _get_session(session_id, tenant_id)
page = session["page"]
portal = session["portal"]
# Check authentication
if not session.get("authenticated", False):
raise HTTPException(status_code=401, detail="Session not authenticated")
# Extract data based on portal and config
if portal == "hmrc":
extracted_data = await _extract_hmrc_data(page, extraction_config)
elif portal == "open_banking":
extracted_data = await _extract_banking_data(page, extraction_config)
else:
raise ValueError(f"Unsupported portal: {portal}")
# Update session
session["last_activity"] = datetime.utcnow()
# Publish extraction event
event_payload = EventPayload(
data={
"session_id": session_id,
"portal": portal,
"extraction_config": extraction_config,
"extracted_data": extracted_data,
"tenant_id": tenant_id,
},
actor=current_user.get("sub", "system"),
tenant_id=tenant_id,
trace_id=span.get_span_context().trace_id,
)
await event_bus.publish("rpa.data_extracted", event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
logger.info(
"Data extracted",
session_id=session_id,
portal=portal,
records_count=len(extracted_data.get("records", [])),
)
return {
"status": "success",
"extracted_data": extracted_data,
"records_count": len(extracted_data.get("records", [])),
}
except Exception as e:
logger.error("Data extraction failed", session_id=session_id, error=str(e))
raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}")
@app.delete("/sessions/{session_id}")
async def close_session(
session_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, str]:
"""Close RPA session"""
with tracer.start_as_current_span("close_session") as span:
span.set_attribute("session_id", session_id)
try:
session = _get_session(session_id, tenant_id)
# Close browser context
await session["context"].close()
# Remove from active sessions
del active_sessions[session_id]
logger.info("Session closed", session_id=session_id)
return {"status": "closed"}
except Exception as e:
logger.error("Failed to close session", session_id=session_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to close session")
def _get_session(session_id: str, tenant_id: str) -> dict[str, Any]:
"""Get and validate session"""
if session_id not in active_sessions:
raise HTTPException(status_code=404, detail="Session not found")
session = active_sessions[session_id]
# Check tenant access
if session["tenant_id"] != tenant_id:
raise HTTPException(status_code=403, detail="Access denied")
# Check timeout
if (
datetime.utcnow() - session["last_activity"]
).total_seconds() > settings.session_timeout:
raise HTTPException(status_code=408, detail="Session expired")
return session
async def _login_hmrc(page: Page, credentials: dict[str, str]) -> bool:
"""Login to HMRC portal"""
try:
# Navigate to HMRC login
await page.goto(settings.hmrc_base_url)
# Wait for login form
await page.wait_for_selector('input[name="userId"]', timeout=settings.timeout)
# Fill credentials
await page.fill('input[name="userId"]', credentials.get("user_id", ""))
await page.fill('input[name="password"]', credentials.get("password", ""))
# Submit form
await page.click('button[type="submit"]')
# Wait for redirect or error
await page.wait_for_load_state("networkidle")
        # Heuristic success check: HMRC redirects away from the sign-in page on success
        current_url = page.url
        return "sign-in" not in current_url.lower()
except Exception as e:
logger.error("HMRC login failed", error=str(e))
return False
async def _login_open_banking(page: Page, credentials: dict[str, str]) -> bool:
"""Login to Open Banking portal"""
try:
# This would implement Open Banking login flow
# For now, return False as it's not implemented
logger.warning("Open Banking login not implemented")
return False
except Exception as e:
logger.error("Open Banking login failed", error=str(e))
return False
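# One possible shape for the missing flow, kept separate so the stub above stays a stub.
# Everything here is an assumption: settings.open_banking_base_url is a hypothetical
# setting, and the selectors are placeholders that would have to match the real
# provider's login and consent pages.
async def _login_open_banking_sketch(page: Page, credentials: dict[str, str]) -> bool:
    try:
        await page.goto(settings.open_banking_base_url)  # hypothetical setting
        await page.wait_for_selector('input[name="username"]', timeout=settings.timeout)
        await page.fill('input[name="username"]', credentials.get("user_id", ""))
        await page.fill('input[name="password"]', credentials.get("password", ""))
        await page.click('button[type="submit"]')
        # Open Banking journeys normally bounce through a consent screen before
        # redirecting back to the caller; wait for the navigation chain to settle.
        await page.wait_for_load_state("networkidle")
        return "login" not in page.url.lower()
    except Exception as e:
        logger.error("Open Banking login sketch failed", error=str(e))
        return False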
async def _extract_hmrc_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
"""Extract data from HMRC portal"""
try:
data_type = config.get("data_type", "tax_returns")
tax_year = config.get("tax_year", "2023-24")
extracted_data = {
"data_type": data_type,
"tax_year": tax_year,
"records": [],
"extracted_at": datetime.utcnow().isoformat(),
}
if data_type == "tax_returns":
# Navigate to tax returns section
await page.click('a[href*="tax-return"]')
await page.wait_for_load_state("networkidle")
# Extract return data
returns = await page.query_selector_all(".tax-return-item")
for return_element in returns:
return_data = await return_element.evaluate(
"""
element => ({
year: element.querySelector('.tax-year')?.textContent?.trim(),
status: element.querySelector('.status')?.textContent?.trim(),
amount: element.querySelector('.amount')?.textContent?.trim()
})
"""
)
extracted_data["records"].append(return_data)
return extracted_data
except Exception as e:
logger.error("HMRC data extraction failed", error=str(e))
return {"error": str(e), "records": []}
async def _extract_banking_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
"""Extract banking data via Open Banking"""
try:
# This would implement Open Banking data extraction
logger.warning("Open Banking extraction not implemented")
return {"error": "Not implemented", "records": []}
except Exception as e:
logger.error("Banking data extraction failed", error=str(e))
return {"error": str(e), "records": []}
async def _cleanup_session_after_timeout(session_id: str, timeout_seconds: int) -> None:
"""Cleanup session after timeout"""
await asyncio.sleep(timeout_seconds)
if session_id in active_sessions:
try:
session = active_sessions[session_id]
await session["context"].close()
del active_sessions[session_id]
logger.info("Session cleaned up due to timeout", session_id=session_id)
except Exception as e:
logger.error(
"Failed to cleanup session", session_id=session_id, error=str(e)
)
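# How the timeout task is expected to be wired up: the session-creation endpoint
# (earlier in this file, not shown in this excerpt) would schedule the coroutine as a
# background task so the request is not blocked. A minimal sketch, assuming the session
# entry already exists in active_sessions:
#
#     cleanup_task = asyncio.create_task(
#         _cleanup_session_after_timeout(session_id, settings.session_timeout)
#     )
#     active_sessions[session_id]["cleanup_task"] = cleanup_task
#
# Keeping a reference to the task matters: the event loop only holds weak references
# to tasks, so an unreferenced cleanup task can be garbage-collected before it fires.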
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""
return JSONResponse(
status_code=exc.status_code,
content=ErrorResponse(
type=f"https://httpstatuses.com/{exc.status_code}",
title=exc.detail,
status=exc.status_code,
detail=exc.detail,
instance=str(request.url),
trace_id="",
).model_dump(),
)
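# For reference, the handler above serialises errors as an RFC 7807 "problem details"
# document. An expired session (the 408 raised in _get_session) would come back roughly
# as follows (the instance URL is illustrative):
#
#     {
#         "type": "https://httpstatuses.com/408",
#         "title": "Session expired",
#         "status": 408,
#         "detail": "Session expired",
#         "instance": "http://localhost:8001/sessions/<session_id>/extract",
#         "trace_id": ""
#     }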
if __name__ == "__main__":
import uvicorn
uvicorn.run("main:app", host="0.0.0.0", port=8001, reload=True, log_config=None)

View File

@@ -0,0 +1,17 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0
# Service-specific dependencies
# Browser automation
playwright>=1.40.0
# Additional async utilities
# asyncio-timeout>=4.0.3 # Deprecated, use asyncio.timeout from Python 3.11+ standard library
# Session management
# Note: aioredis 2.x is no longer maintained; redis>=4.2 provides the same API as redis.asyncio
aioredis>=2.0.1
# Browser management
psutil>=5.9.0