Files
ai-tax-agent/apps/svc_forms/main.py
harkon b324ff09ef
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Initial commit
2025-10-11 08:41:36 +01:00

626 lines
20 KiB
Python

"""PDF form filling with evidence pack generation."""
# FILE: apps/svc-forms/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel
# mypy: disable-error-code=union-attr
import os
# Import shared libraries
import sys
from datetime import datetime
from io import BytesIO
from typing import Any
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse, Response
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import (
BaseAppSettings,
create_event_bus,
create_minio_client,
create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.forms import UK_TAX_FORMS, EvidencePackGenerator, PDFFormFiller
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
# Module-level structlog logger used by every handler and background task below.
logger = structlog.get_logger()
class FormsSettings(BaseAppSettings):
    """Configuration values specific to the forms service."""

    service_name: str = "svc-forms"

    # Where PDF templates are read from, and which buckets receive output
    forms_template_dir: str = "forms/templates"
    output_bucket: str = "filled-forms"
    evidence_packs_bucket: str = "evidence-packs"

    # Form identifiers accepted by the fill endpoints and loaded at startup
    supported_forms: list[str] = ["SA100", "SA103", "SA105", "SA106"]

    # PDF output options
    pdf_quality: str = "high"
    flatten_forms: bool = True
# Build the application object and its FormsSettings instance through the
# shared app factory; both are used throughout the rest of this module.
app, settings = create_app(
service_name="svc-forms",
title="Tax Agent Forms Service",
description="PDF form filling and evidence pack generation",
settings_class=FormsSettings,
)
# Global clients: declared as None at import time and assigned inside
# startup_event(), which is why every annotation is Optional.
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
pdf_form_filler: PDFFormFiller | None = None
evidence_pack_generator: EvidencePackGenerator | None = None
event_bus: EventBus | None = None
# Tracing and metrics handles shared by the endpoints and background tasks.
tracer = get_tracer("svc-forms")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
    """Initialize the module-level clients used by the forms service.

    Sets up observability, storage, Neo4j, the PDF form filler (loading a
    template for every supported form whose PDF file exists), the evidence
    pack generator and finally the event bus.

    The storage buckets are ensured *before* the event bus is started and
    subscribed: the ``CALC_SCHEDULE_READY`` handler writes filled forms to
    the output bucket, so the bucket must exist before any event can be
    delivered.
    """
    global storage_client, document_storage, neo4j_client, pdf_form_filler  # pylint: disable=line-too-long
    global evidence_pack_generator, event_bus

    logger.info("Starting forms service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO-backed storage
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize Neo4j client
    neo4j_driver = create_neo4j_client(settings)
    neo4j_client = Neo4jClient(neo4j_driver)

    # Initialize PDF form filler and load the templates that are present
    pdf_form_filler = PDFFormFiller()
    for form_id in settings.supported_forms:
        template_path = os.path.join(settings.forms_template_dir, f"{form_id}.pdf")
        if os.path.exists(template_path):
            pdf_form_filler.load_template(form_id, template_path)
        else:
            # A missing template is not fatal; the form simply cannot be filled.
            logger.warning(
                "Form template not found", form_id=form_id, path=template_path
            )

    # Initialize evidence pack generator
    evidence_pack_generator = EvidencePackGenerator(storage_client)

    # Ensure buckets exist before any event handler can try to write to them
    await storage_client.ensure_bucket(settings.output_bucket)
    await storage_client.ensure_bucket(settings.evidence_packs_bucket)

    # Initialize event bus last, once everything the handler needs is ready
    event_bus = create_event_bus(settings)
    await event_bus.start()  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]

    # Subscribe to calculation completion events
    await event_bus.subscribe(  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
        EventTopics.CALC_SCHEDULE_READY, _handle_calculation_ready
    )

    logger.info("Forms service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Close the Neo4j client and stop the event bus on shutdown.

    The event bus is stopped in a ``finally`` clause so that an error raised
    while closing the Neo4j client cannot leave the bus running. Any such
    error still propagates to the caller.
    """
    global neo4j_client, event_bus

    logger.info("Shutting down forms service")

    try:
        if neo4j_client:
            await neo4j_client.close()
    finally:
        # Runs even when the Neo4j close above fails.
        if event_bus:
            await event_bus.stop()

    logger.info("Forms service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Report service status, name, version and supported forms.

    Returns:
        A dict with ``status``, ``service``, ``version``, ``timestamp`` and
        ``supported_forms``. The timestamp is a timezone-aware UTC ISO-8601
        string, so it is unambiguous regardless of the host's local timezone.
    """
    # Local import: the module only imports ``datetime`` at file level.
    from datetime import timezone

    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": "1.0.0",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "supported_forms": settings.supported_forms,
    }
@app.post("/fill/{form_id}")
async def fill_form(
    form_id: str,
    field_values: dict[str, Any],
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Queue a PDF form fill with the supplied field values.

    The fill itself runs as a background task; the response only carries the
    generated ``filling_id`` and a ``"filling"`` status.

    Raises:
        HTTPException: 400 when ``form_id`` is not in the supported forms,
            500 when scheduling the fill fails for any other reason.
    """
    field_total = len(field_values)

    with tracer.start_as_current_span("fill_form") as span:
        span.set_attribute("form_id", form_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("field_count", field_total)

        try:
            # Reject forms this service is not configured for
            is_supported = form_id in settings.supported_forms
            if not is_supported:
                raise HTTPException(
                    status_code=400, detail=f"Unsupported form: {form_id}"
                )

            # Identifier handed back to the caller for later download
            new_filling_id = str(ulid.new())
            span.set_attribute("filling_id", new_filling_id)

            # The actual fill happens after the response is sent
            acting_user = current_user.get("sub", "system")
            background_tasks.add_task(
                _fill_form_async,
                form_id,
                field_values,
                tenant_id,
                new_filling_id,
                acting_user,
            )

            logger.info(
                "Form filling started", form_id=form_id, filling_id=new_filling_id
            )

            response_body: dict[str, Any] = {
                "filling_id": new_filling_id,
                "form_id": form_id,
                "status": "filling",
                "field_count": field_total,
            }
            return response_body

        except HTTPException:
            # Deliberate HTTP errors pass through unchanged
            raise
        except Exception as e:
            logger.error("Failed to start form filling", form_id=form_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to start form filling")
@app.post("/fill-from-calculation/{calculation_id}")
async def fill_form_from_calculation(
    calculation_id: str,
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Queue a form fill using the form boxes of a stored calculation.

    Looks up the non-retracted calculation for the caller's tenant, takes its
    ``schedule`` as the form ID, turns every linked non-retracted form box
    into a ``box_<id>`` field value and schedules the fill as a background
    task.

    Raises:
        HTTPException: 404 when no matching calculation exists, 400 when the
            calculation has no schedule, 500 on any other failure.
    """
    with tracer.start_as_current_span("fill_form_from_calculation") as span:
        span.set_attribute("calculation_id", calculation_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Both queries are scoped by calculation ID *and* tenant ID.
            query_params = {"calculation_id": calculation_id, "tenant_id": tenant_id}

            # Get calculation from Neo4j
            calc_query = """
                MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})
                WHERE c.retracted_at IS NULL
                RETURN c
            """
            calc_results = await neo4j_client.run_query(  # pyright: ignore[reportOptionalMemberAccess]
                calc_query, query_params
            )

            if not calc_results:
                raise HTTPException(status_code=404, detail="Calculation not found")

            calculation = calc_results[0]["c"]
            form_id = calculation.get("schedule")

            if not form_id:
                raise HTTPException(
                    status_code=400, detail="No schedule found in calculation"
                )

            # Get form boxes. The tenant filter mirrors the calculation query
            # so that boxes of another tenant's calculation sharing the same
            # calculation_id can never be picked up.
            boxes_query = """
                MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})-[:HAS_BOX]->(b:FormBox)
                WHERE c.retracted_at IS NULL AND b.retracted_at IS NULL
                RETURN b
            """
            box_results = await neo4j_client.run_query(  # pyright: ignore[reportOptionalMemberAccess]
                boxes_query, query_params
            )

            # Convert form boxes to field values keyed as "box_<id>"
            field_values = {
                f"box_{row['b']['box']}": row["b"]["value"] for row in box_results
            }

            # Generate filling ID
            filling_id = str(ulid.new())
            span.set_attribute("filling_id", filling_id)
            span.set_attribute("form_id", form_id)

            # Start background form filling
            background_tasks.add_task(
                _fill_form_async,
                form_id,
                field_values,
                tenant_id,
                filling_id,
                current_user.get("sub", "system"),
                calculation_id,
            )

            logger.info(
                "Form filling from calculation started",
                form_id=form_id,
                filling_id=filling_id,
                calculation_id=calculation_id,
            )

            return {
                "filling_id": filling_id,
                "form_id": form_id,
                "calculation_id": calculation_id,
                "status": "filling",
                "field_count": len(field_values),
            }

        except HTTPException:
            # Deliberate HTTP errors pass through unchanged
            raise
        except Exception as e:
            logger.error(
                "Failed to fill form from calculation",
                calculation_id=calculation_id,
                error=str(e),
            )
            # Chain the cause so the original traceback is kept for debugging.
            raise HTTPException(
                status_code=500, detail="Failed to fill form from calculation"
            ) from e
@app.get("/download/{filling_id}")
async def download_filled_form(
    filling_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> Response:
    """Return a previously filled form as a PDF attachment.

    The object is read from the output bucket under the caller's tenant
    prefix.

    Raises:
        HTTPException: 404 when storage returns nothing for the key,
            500 when the lookup fails for any other reason.
    """
    with tracer.start_as_current_span("download_filled_form") as span:
        span.set_attribute("filling_id", filling_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Filled forms are stored per tenant, keyed by filling ID
            storage_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
            pdf_bytes = await storage_client.get_object(  # pyright: ignore[reportOptionalMemberAccess]
                settings.output_bucket, storage_key
            )

            if pdf_bytes:
                attachment_headers = {
                    "Content-Disposition": f"attachment; filename={filling_id}.pdf"
                }
                return Response(
                    content=pdf_bytes,
                    media_type="application/pdf",
                    headers=attachment_headers,
                )

            raise HTTPException(status_code=404, detail="Filled form not found")

        except HTTPException:
            # Deliberate HTTP errors pass through unchanged
            raise
        except Exception as e:
            logger.error(
                "Failed to download filled form", filling_id=filling_id, error=str(e)
            )
            raise HTTPException(
                status_code=500, detail="Failed to download filled form"
            )
@app.post("/evidence-pack")
async def create_evidence_pack(
    taxpayer_id: str,
    tax_year: str,
    scope: str,
    evidence_items: list[dict[str, Any]],
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Queue creation of an evidence pack from the supplied evidence items.

    Pack creation runs as a background task; the response only carries the
    generated ``pack_id`` and a ``"creating"`` status.

    Raises:
        HTTPException: 500 when scheduling the pack creation fails.
    """
    item_total = len(evidence_items)

    with tracer.start_as_current_span("create_evidence_pack") as span:
        for attr_name, attr_value in (
            ("taxpayer_id", taxpayer_id),
            ("tax_year", tax_year),
            ("scope", scope),
            ("tenant_id", tenant_id),
            ("evidence_count", item_total),
        ):
            span.set_attribute(attr_name, attr_value)

        try:
            # Identifier handed back to the caller
            new_pack_id = str(ulid.new())
            span.set_attribute("pack_id", new_pack_id)

            # The pack is assembled after the response is sent
            acting_user = current_user.get("sub", "system")
            background_tasks.add_task(
                _create_evidence_pack_async,
                taxpayer_id,
                tax_year,
                scope,
                evidence_items,
                tenant_id,
                new_pack_id,
                acting_user,
            )

            logger.info(
                "Evidence pack creation started",
                pack_id=new_pack_id,
                taxpayer_id=taxpayer_id,
                scope=scope,
            )

            response_body: dict[str, Any] = {
                "pack_id": new_pack_id,
                "taxpayer_id": taxpayer_id,
                "tax_year": tax_year,
                "scope": scope,
                "status": "creating",
                "evidence_count": item_total,
            }
            return response_body

        except Exception as e:
            logger.error("Failed to start evidence pack creation", error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to start evidence pack creation"
            )
@app.get("/forms")
async def list_supported_forms(
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Describe every supported form and whether its template is loaded.

    For each form the response gives its display name, template
    availability, total field count and at most the first ten fields.

    Raises:
        HTTPException: 500 when building the listing fails.
    """
    try:
        overview: list[dict[str, Any]] = []

        for form_id in settings.supported_forms:
            display_name = UK_TAX_FORMS.get(form_id, {}).get("name", form_id)

            # No filler means no templates at all
            loaded_templates = pdf_form_filler.form_templates if pdf_form_filler else {}
            has_template = form_id in loaded_templates

            # Fields can only be read from a loaded template
            form_fields = (
                pdf_form_filler.get_form_fields(form_id) if has_template else []
            )

            entry = {
                "form_id": form_id,
                "name": display_name,
                "template_available": has_template,
                "field_count": len(form_fields),
                "fields": form_fields[:10],  # overview only: first 10 fields
            }
            overview.append(entry)

        return {"supported_forms": overview, "total_forms": len(overview)}

    except Exception as e:
        logger.error("Failed to list forms", error=str(e))
        raise HTTPException(status_code=500, detail="Failed to list forms")
async def _handle_calculation_ready(topic: str, payload: EventPayload) -> None:
    """Fill a form automatically when a calculation-ready event arrives.

    Events missing ``calculation_id``, ``schedule`` or ``tenant_id`` are
    logged and ignored. Any exception is logged and swallowed so it never
    propagates back to the event bus.
    """
    try:
        event_data = payload.data
        calculation_id = event_data.get("calculation_id")
        schedule = event_data.get("schedule")
        tenant_id = event_data.get("tenant_id")

        # All three identifiers are required to fill and store the form
        if not (calculation_id and schedule and tenant_id):
            logger.warning("Invalid calculation ready event", data=event_data)
            return

        logger.info(
            "Auto-filling form from calculation",
            calculation_id=calculation_id,
            schedule=schedule,
        )

        # Map each box in the event to a "box_<id>" field value
        boxes = event_data.get("form_boxes", {})
        field_values = {
            f"box_{box_id}": box_data.get("value")
            for box_id, box_data in boxes.items()
        }

        # The schedule name doubles as the form ID
        await _fill_form_async(
            form_id=schedule,
            field_values=field_values,
            tenant_id=tenant_id,
            filling_id=str(ulid.new()),
            actor=payload.actor,
            calculation_id=calculation_id,
        )

    except Exception as e:
        logger.error("Failed to handle calculation ready event", error=str(e))
async def _fill_form_async(
    form_id: str,
    field_values: dict[str, Any],
    tenant_id: str,
    filling_id: str,
    actor: str,
    calculation_id: str | None = None,
) -> None:
    """Fill a form, store the PDF and publish a completion event.

    Failures are logged and counted in the error metric; nothing is
    re-raised, so callers never see an exception from this function.
    """
    with tracer.start_as_current_span("fill_form_async") as span:
        span.set_attribute("form_id", form_id)
        span.set_attribute("filling_id", filling_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Render the PDF
            pdf_bytes = pdf_form_filler.fill_form(form_id, field_values)  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
            if not pdf_bytes:
                # pylint: disable-next=broad-exception-raised
                raise Exception("Form filling failed")

            # Persist it under the tenant's prefix in the output bucket
            storage_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
            object_metadata = {
                "form_id": form_id,
                "filling_id": filling_id,
                "tenant_id": tenant_id,
                "calculation_id": calculation_id or "",
                "filled_at": datetime.utcnow().isoformat(),
            }
            stored = await storage_client.put_object(  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
                bucket_name=settings.output_bucket,
                object_name=storage_key,
                data=BytesIO(pdf_bytes),
                length=len(pdf_bytes),
                content_type="application/pdf",
                metadata=object_metadata,
            )
            if not stored:
                # pylint: disable-next=broad-exception-raised
                raise Exception("Failed to store filled form")

            # Count the successful fill
            filled_counter = metrics.counter("forms_filled_total")
            filled_counter.labels(tenant_id=tenant_id, form_id=form_id).inc()

            # Announce completion on the event bus
            completion_event = EventPayload(
                data={
                    "filling_id": filling_id,
                    "form_id": form_id,
                    "tenant_id": tenant_id,
                    "calculation_id": calculation_id,
                    "s3_url": f"s3://{settings.output_bucket}/{storage_key}",
                    "field_count": len(field_values),
                },
                actor=actor,
                tenant_id=tenant_id,
            )
            await event_bus.publish(EventTopics.FORM_FILLED, completion_event)  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]

            logger.info(
                "Form filling completed", filling_id=filling_id, form_id=form_id
            )

        except Exception as e:
            logger.error("Form filling failed", filling_id=filling_id, error=str(e))

            # Count the failure, labelled with the exception class name
            error_counter = metrics.counter("form_filling_errors_total")
            error_counter.labels(
                tenant_id=tenant_id, form_id=form_id, error_type=type(e).__name__
            ).inc()
async def _create_evidence_pack_async(
    taxpayer_id: str,
    tax_year: str,
    scope: str,
    evidence_items: list[dict[str, Any]],
    tenant_id: str,
    pack_id: str,
    actor: str,
) -> None:
    """Build an evidence pack and record the outcome.

    Failures are logged and swallowed; nothing is re-raised.

    NOTE(review): ``pack_id`` and ``actor`` are only used for tracing and
    logging here; they are not passed to the generator. Confirm that this
    is intended.
    """
    with tracer.start_as_current_span("create_evidence_pack_async") as span:
        span.set_attribute("pack_id", pack_id)
        span.set_attribute("taxpayer_id", taxpayer_id)
        span.set_attribute("scope", scope)

        try:
            # Delegate the actual assembly to the generator
            generator_kwargs = {
                "taxpayer_id": taxpayer_id,
                "tax_year": tax_year,
                "scope": scope,
                "evidence_items": evidence_items,
            }
            result = await evidence_pack_generator.create_evidence_pack(  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
                **generator_kwargs
            )

            # Count the created pack
            created_counter = metrics.counter("evidence_packs_created_total")
            created_counter.labels(tenant_id=tenant_id, scope=scope).inc()

            logger.info(
                "Evidence pack created",
                pack_id=pack_id,
                pack_size=result["pack_size"],
                evidence_count=result["evidence_count"],
            )

        except Exception as e:
            logger.error("Evidence pack creation failed", pack_id=pack_id, error=str(e))
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Render an HTTPException as an RFC7807-style JSON error body.

    Args:
        request: The request that triggered the exception; its URL is
            reported as the problem ``instance``.
        exc: The raised exception; its status code and detail populate the
            response body.

    Returns:
        A JSONResponse carrying the exception's status code, the serialized
        ErrorResponse and any headers attached to the exception.
    """
    problem = ErrorResponse(
        type=f"https://httpstatuses.com/{exc.status_code}",
        title=exc.detail,
        status=exc.status_code,
        detail=exc.detail,
        instance=str(request.url),
        trace_id="",
    )
    return JSONResponse(
        status_code=exc.status_code,
        content=problem.model_dump(),
        # Forward headers set on the exception (e.g. WWW-Authenticate on a
        # 401); dropping them would lose information the raiser attached.
        headers=getattr(exc, "headers", None),
    )
if __name__ == "__main__":
    # Development entry point: serve this module's app with auto-reload.
    import uvicorn

    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8009,
        reload=True,
        log_config=None,
    )