Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
10
libs/forms/__init__.py
Normal file
10
libs/forms/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""PDF form filling and evidence pack generation."""
|
||||
|
||||
from .evidence_pack import UK_TAX_FORMS, EvidencePackGenerator
|
||||
from .pdf_filler import PDFFormFiller
|
||||
|
||||
__all__ = [
|
||||
"PDFFormFiller",
|
||||
"EvidencePackGenerator",
|
||||
"UK_TAX_FORMS",
|
||||
]
|
||||
185
libs/forms/evidence_pack.py
Normal file
185
libs/forms/evidence_pack.py
Normal file
@@ -0,0 +1,185 @@
|
||||
"""Evidence pack generation with manifests and signatures."""
|
||||
|
||||
import io
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class EvidencePackGenerator:  # pylint: disable=too-few-public-methods
    """Build ZIP evidence packs holding source documents plus a hashed manifest.

    The storage client is only used through two awaitable calls:
    ``get_object(bucket_name=..., object_name=...)`` and
    ``put_object(bucket_name=..., object_name=..., data=..., length=...,
    content_type=...)``.
    """

    def __init__(self, storage_client: Any) -> None:
        self.storage = storage_client

    async def create_evidence_pack(  # pylint: disable=too-many-locals
        self,
        taxpayer_id: str,
        tax_year: str,
        scope: str,
        evidence_items: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Create an evidence pack ZIP, store it, and describe the result.

        Each evidence item must carry a ``doc_id``; ``page``, ``bbox`` and
        ``text_hash`` are optional and copied into the manifest verbatim.
        Every distinct document is fetched once from the ``raw-documents``
        bucket and written once to ``documents/<doc_id>.pdf`` inside the ZIP,
        even when several evidence items point at the same document.

        The manifest hash is the SHA-256 of the manifest serialised with
        ``indent=2, sort_keys=True`` while ``signatures`` is still empty.

        Args:
            taxpayer_id: Tenant identifier used in the storage object keys.
            tax_year: Tax year label embedded in the manifest and file name.
            scope: Scope label embedded in the manifest and file name.
            evidence_items: Evidence references to include in the pack.

        Returns:
            A dict with ``pack_filename``, ``pack_key``, ``pack_size``,
            ``evidence_count`` (items actually recorded in the manifest),
            ``manifest_hash``, ``s3_url`` and ``missing_doc_ids`` (documents
            that storage returned no content for).

        Raises:
            KeyError: If an evidence item has no ``doc_id``.
            RuntimeError: If the storage client reports the upload failed.
            Exception: Anything raised by the storage client is logged and
                re-raised unchanged.
        """
        # pylint: disable=import-outside-toplevel
        import hashlib
        import json
        import zipfile
        from datetime import datetime, timezone

        try:
            zip_buffer = io.BytesIO()

            # doc_id -> (zip member name, sha256 hex digest, size in bytes),
            # or None when storage returned nothing for that document.
            fetched: dict[Any, tuple[str, str, int] | None] = {}
            missing_doc_ids: list[Any] = []

            with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
                manifest: dict[str, Any] = {
                    "taxpayer_id": taxpayer_id,
                    "tax_year": tax_year,
                    "scope": scope,
                    # datetime.utcnow() is deprecated; dropping tzinfo keeps the
                    # naive-UTC ISO format the manifest has always used.
                    "created_at": datetime.now(timezone.utc)
                    .replace(tzinfo=None)
                    .isoformat(),
                    "evidence_items": [],
                    "signatures": {},
                }

                for item in evidence_items:
                    doc_id = item["doc_id"]

                    if doc_id not in fetched:
                        doc_content = await self.storage.get_object(
                            bucket_name="raw-documents",
                            object_name=f"tenants/{taxpayer_id}/raw/{doc_id}.pdf",
                        )

                        if doc_content:
                            zip_filename = f"documents/{doc_id}.pdf"
                            zip_file.writestr(zip_filename, doc_content)
                            fetched[doc_id] = (
                                zip_filename,
                                hashlib.sha256(doc_content).hexdigest(),
                                len(doc_content),
                            )
                        else:
                            # Keep the best-effort behaviour (skip the item) but
                            # make the gap visible instead of silent.
                            fetched[doc_id] = None
                            missing_doc_ids.append(doc_id)
                            logger.warning(
                                "Evidence document not found, skipping",
                                taxpayer_id=taxpayer_id,
                                doc_id=doc_id,
                            )

                    stored = fetched[doc_id]
                    if stored is None:
                        continue

                    zip_filename, file_hash, file_size = stored
                    manifest["evidence_items"].append(
                        {
                            "doc_id": doc_id,
                            "filename": zip_filename,
                            "page": item.get("page"),
                            "bbox": item.get("bbox"),
                            "text_hash": item.get("text_hash"),
                            "file_hash": file_hash,
                            "file_size": file_size,
                        }
                    )

                # Hash the manifest before the signature fields are filled in.
                manifest_json = json.dumps(manifest, indent=2, sort_keys=True)
                manifest_hash = hashlib.sha256(manifest_json.encode()).hexdigest()

                manifest["signatures"]["manifest_hash"] = manifest_hash
                manifest["signatures"]["algorithm"] = "SHA-256"

                zip_file.writestr("manifest.json", json.dumps(manifest, indent=2))

            # The archive is only complete once the ZipFile context has closed.
            zip_content = zip_buffer.getvalue()

            pack_filename = f"evidence_pack_{taxpayer_id}_{tax_year}_{scope}.zip"
            pack_key = f"tenants/{taxpayer_id}/evidence_packs/{pack_filename}"

            success = await self.storage.put_object(
                bucket_name="evidence-packs",
                object_name=pack_key,
                data=io.BytesIO(zip_content),
                length=len(zip_content),
                content_type="application/zip",
            )

            if not success:
                raise RuntimeError("Failed to store evidence pack")

            return {
                "pack_filename": pack_filename,
                "pack_key": pack_key,
                "pack_size": len(zip_content),
                # Count what is in the pack, not what was requested.
                "evidence_count": len(manifest["evidence_items"]),
                "manifest_hash": manifest_hash,
                "s3_url": f"s3://evidence-packs/{pack_key}",
                "missing_doc_ids": missing_doc_ids,
            }

        except Exception as e:  # pylint: disable=broad-exception-caught
            logger.error("Failed to create evidence pack", error=str(e))
            raise
|
||||
|
||||
|
||||
# Static configuration for the supported UK tax forms.
# Layout: form id -> {"name", "template_path", "boxes"}, where "boxes" maps a
# box number (as a string) to its description and value type
# ("text", "date" or "currency").
UK_TAX_FORMS: dict[str, dict[str, Any]] = {
    # Main Self Assessment return
    "SA100": {
        "name": "Self Assessment Tax Return",
        "template_path": "forms/templates/SA100.pdf",
        "boxes": {
            "1": {"description": "Your name", "type": "text"},
            "2": {"description": "Your address", "type": "text"},
            "3": {"description": "Your UTR", "type": "text"},
            "4": {"description": "Your NI number", "type": "text"},
        },
    },
    # Self-employment supplementary pages
    "SA103": {
        "name": "Self-employment (full)",
        "template_path": "forms/templates/SA103.pdf",
        "boxes": {
            "1": {"description": "Business name", "type": "text"},
            "2": {"description": "Business description", "type": "text"},
            "3": {"description": "Accounting period start", "type": "date"},
            "4": {"description": "Accounting period end", "type": "date"},
            "20": {"description": "Total turnover", "type": "currency"},
            "31": {
                "description": "Total allowable business expenses",
                "type": "currency",
            },
            "32": {"description": "Net profit", "type": "currency"},
            "33": {"description": "Balancing charges", "type": "currency"},
            "34": {
                "description": "Goods/services for own use",
                "type": "currency",
            },
            "35": {"description": "Total taxable profits", "type": "currency"},
        },
    },
    # Property income supplementary pages
    "SA105": {
        "name": "Property income",
        "template_path": "forms/templates/SA105.pdf",
        "boxes": {
            "20": {
                "description": "Total rents and other income",
                "type": "currency",
            },
            "29": {
                "description": "Premiums for the grant of a lease",
                "type": "currency",
            },
            "31": {
                "description": "Rent, rates, insurance, ground rents etc",
                "type": "currency",
            },
            "32": {"description": "Property management", "type": "currency"},
            "33": {
                "description": "Services provided, including wages",
                "type": "currency",
            },
            "34": {
                "description": "Repairs, maintenance and renewals",
                "type": "currency",
            },
            "35": {
                "description": "Finance costs, including interest",
                "type": "currency",
            },
            "36": {"description": "Professional fees", "type": "currency"},
            "37": {
                "description": "Costs of services provided",
                "type": "currency",
            },
            "38": {
                "description": "Other allowable property expenses",
                "type": "currency",
            },
        },
    },
}
|
||||
246
libs/forms/pdf_filler.py
Normal file
246
libs/forms/pdf_filler.py
Normal file
@@ -0,0 +1,246 @@
|
||||
"""PDF form filling using pdfrw with reportlab fallback."""
|
||||
|
||||
import io
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class PDFFormFiller:
|
||||
"""PDF form filling using pdfrw with reportlab fallback"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.form_templates: dict[str, Any] = {}
|
||||
|
||||
def load_template(self, form_id: str, template_path: str) -> bool:
|
||||
"""Load PDF form template"""
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from pdfrw import PdfReader # type: ignore
|
||||
|
||||
template = PdfReader(template_path)
|
||||
if template is None:
|
||||
logger.error(
|
||||
"Failed to load PDF template", form_id=form_id, path=template_path
|
||||
)
|
||||
return False
|
||||
|
||||
self.form_templates[form_id] = {"template": template, "path": template_path}
|
||||
|
||||
logger.info("Loaded PDF template", form_id=form_id, path=template_path)
|
||||
return True
|
||||
|
||||
except ImportError:
|
||||
logger.error("pdfrw not available for PDF form filling")
|
||||
return False
|
||||
except Exception as e: # pylint: disable=broad-exception-caught
|
||||
logger.error("Failed to load PDF template", form_id=form_id, error=str(e))
|
||||
return False
|
||||
|
||||
def fill_form(
|
||||
self,
|
||||
form_id: str,
|
||||
field_values: dict[str, str | int | float | bool],
|
||||
output_path: str | None = None,
|
||||
) -> bytes | None:
|
||||
"""Fill PDF form with values"""
|
||||
|
||||
if form_id not in self.form_templates:
|
||||
logger.error("Form template not loaded", form_id=form_id)
|
||||
return None
|
||||
|
||||
try:
|
||||
return self._fill_with_pdfrw(form_id, field_values, output_path)
|
||||
except Exception as e: # pylint: disable=broad-exception-caught
|
||||
logger.warning(
|
||||
"pdfrw filling failed, trying reportlab overlay", error=str(e)
|
||||
)
|
||||
return self._fill_with_overlay(form_id, field_values, output_path)
|
||||
|
||||
def _fill_with_pdfrw(
|
||||
self,
|
||||
form_id: str,
|
||||
field_values: dict[str, Any],
|
||||
output_path: str | None = None,
|
||||
) -> bytes | None:
|
||||
"""Fill form using pdfrw"""
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from pdfrw import PdfDict, PdfReader, PdfWriter
|
||||
|
||||
template_info = self.form_templates[form_id]
|
||||
template = PdfReader(template_info["path"])
|
||||
|
||||
# Get form fields
|
||||
if template.Root.AcroForm is None: # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
|
||||
logger.warning("PDF has no AcroForm fields", form_id=form_id)
|
||||
return self._fill_with_overlay(form_id, field_values, output_path)
|
||||
|
||||
# Fill form fields
|
||||
for field in template.Root.AcroForm.Fields: # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
|
||||
field_name = field.T
|
||||
if field_name and field_name[1:-1] in field_values: # Remove parentheses
|
||||
field_value = field_values[field_name[1:-1]]
|
||||
|
||||
# Set field value
|
||||
if isinstance(field_value, bool):
|
||||
# Checkbox field
|
||||
if field_value:
|
||||
field.V = PdfDict.Yes # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
|
||||
field.AS = PdfDict.Yes # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
|
||||
else:
|
||||
field.V = PdfDict.Off # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
|
||||
field.AS = PdfDict.Off # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
|
||||
else:
|
||||
# Text field
|
||||
field.V = str(field_value)
|
||||
|
||||
# Make field read-only
|
||||
field.Ff = 1 # Read-only flag
|
||||
|
||||
# Flatten form (make fields non-editable)
|
||||
if template.Root.AcroForm: # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
|
||||
template.Root.AcroForm.NeedAppearances = True # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
|
||||
|
||||
# Write to output
|
||||
if output_path:
|
||||
writer = PdfWriter(output_path)
|
||||
writer.write(template)
|
||||
with open(output_path, "rb") as f:
|
||||
return f.read()
|
||||
else:
|
||||
# Write to bytes
|
||||
output_buffer = io.BytesIO()
|
||||
writer = PdfWriter(output_buffer)
|
||||
writer.write(template)
|
||||
return output_buffer.getvalue()
|
||||
|
||||
def _fill_with_overlay( # pylint: disable=too-many-locals
|
||||
self,
|
||||
form_id: str,
|
||||
field_values: dict[str, Any],
|
||||
output_path: str | None = None,
|
||||
) -> bytes | None:
|
||||
"""Fill form using reportlab overlay method"""
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from PyPDF2 import PdfReader, PdfWriter
|
||||
from reportlab.lib.pagesizes import A4
|
||||
from reportlab.pdfgen import canvas
|
||||
|
||||
template_info = self.form_templates[form_id]
|
||||
|
||||
# Read original PDF
|
||||
original_pdf = PdfReader(template_info["path"])
|
||||
|
||||
# Create overlay with form data
|
||||
overlay_buffer = io.BytesIO()
|
||||
overlay_canvas = canvas.Canvas(overlay_buffer, pagesize=A4)
|
||||
|
||||
# Get field positions (this would be configured per form)
|
||||
field_positions = self._get_field_positions(form_id)
|
||||
|
||||
# Add text to overlay
|
||||
for field_name, value in field_values.items():
|
||||
if field_name in field_positions:
|
||||
pos = field_positions[field_name]
|
||||
overlay_canvas.drawString(pos["x"], pos["y"], str(value))
|
||||
|
||||
overlay_canvas.save()
|
||||
overlay_buffer.seek(0)
|
||||
|
||||
# Read overlay PDF
|
||||
overlay_pdf = PdfReader(overlay_buffer)
|
||||
|
||||
# Merge original and overlay
|
||||
writer = PdfWriter()
|
||||
for page_num, _ in enumerate(original_pdf.pages):
|
||||
original_page = original_pdf.pages[page_num]
|
||||
|
||||
if page_num < len(overlay_pdf.pages):
|
||||
overlay_page = overlay_pdf.pages[page_num]
|
||||
original_page.merge_page(overlay_page)
|
||||
|
||||
writer.add_page(original_page)
|
||||
|
||||
# Write result
|
||||
if output_path:
|
||||
with open(output_path, "wb") as output_file:
|
||||
writer.write(output_file)
|
||||
with open(output_path, "rb") as f:
|
||||
return f.read()
|
||||
else:
|
||||
output_buffer = io.BytesIO()
|
||||
writer.write(output_buffer)
|
||||
return output_buffer.getvalue()
|
||||
|
||||
except ImportError as e:
|
||||
logger.error(
|
||||
"Required libraries not available for overlay method", error=str(e)
|
||||
)
|
||||
return None
|
||||
except Exception as e: # pylint: disable=broad-exception-caught
|
||||
logger.error("Overlay filling failed", form_id=form_id, error=str(e))
|
||||
return None
|
||||
|
||||
def _get_field_positions(self, form_id: str) -> dict[str, dict[str, float]]:
|
||||
"""Get field positions for overlay method"""
|
||||
# This would be configured per form type
|
||||
# For now, return sample positions for SA103
|
||||
if form_id == "SA103":
|
||||
return {
|
||||
"box_1": {"x": 100, "y": 750}, # Business name
|
||||
"box_2": {"x": 100, "y": 720}, # Business description
|
||||
"box_20": {"x": 400, "y": 600}, # Total turnover
|
||||
"box_31": {"x": 400, "y": 570}, # Total expenses
|
||||
"box_32": {"x": 400, "y": 540}, # Net profit
|
||||
}
|
||||
return {}
|
||||
|
||||
def get_form_fields(self, form_id: str) -> list[dict[str, Any]]:
|
||||
"""Get list of available form fields"""
|
||||
if form_id not in self.form_templates:
|
||||
return []
|
||||
|
||||
try:
|
||||
# pylint: disable=import-outside-toplevel
|
||||
from pdfrw import PdfReader
|
||||
|
||||
template_info = self.form_templates[form_id]
|
||||
template = PdfReader(template_info["path"])
|
||||
|
||||
if template.Root.AcroForm is None: # fmt: skip # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue]
|
||||
return []
|
||||
|
||||
fields = []
|
||||
for field in template.Root.AcroForm.Fields: # fmt: skip # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue]
|
||||
field_info = {
|
||||
"name": field.T[1:-1] if field.T else None, # Remove parentheses
|
||||
"type": self._get_field_type(field),
|
||||
"required": bool(field.Ff and int(field.Ff) & 2), # Required flag
|
||||
"readonly": bool(field.Ff and int(field.Ff) & 1), # Read-only flag
|
||||
}
|
||||
|
||||
if field.V:
|
||||
field_info["default_value"] = str(field.V)
|
||||
|
||||
fields.append(field_info)
|
||||
|
||||
return fields
|
||||
|
||||
except Exception as e: # pylint: disable=broad-exception-caught
|
||||
logger.error("Failed to get form fields", form_id=form_id, error=str(e))
|
||||
return []
|
||||
|
||||
def _get_field_type(self, field: Any) -> str:
|
||||
"""Determine field type from PDF field"""
|
||||
if hasattr(field, "FT"):
|
||||
field_type = str(field.FT)
|
||||
if "Tx" in field_type:
|
||||
return "text"
|
||||
if "Btn" in field_type:
|
||||
return "checkbox" if field.Ff and int(field.Ff) & 32768 else "button"
|
||||
if "Ch" in field_type:
|
||||
return "choice"
|
||||
return "unknown"
|
||||
Reference in New Issue
Block a user