Initial commit

2025-10-11 08:41:36 +01:00
commit b324ff09ef
276 changed files with 55220 additions and 0 deletions
--- a/libs/forms/__init__.py
+++ b/libs/forms/__init__.py
@@ -0,0 +1,10 @@
+"""PDF form filling and evidence pack generation."""
+
+from .evidence_pack import UK_TAX_FORMS, EvidencePackGenerator
+from .pdf_filler import PDFFormFiller
+
+__all__ = [  # public names re-exported from the sibling modules imported above
+    "PDFFormFiller",
+    "EvidencePackGenerator",
+    "UK_TAX_FORMS",
+]
--- a/libs/forms/evidence_pack.py
+++ b/libs/forms/evidence_pack.py
@@ -0,0 +1,185 @@
+"""Evidence pack generation with manifests and signatures."""
+
+import io
+from typing import Any
+
+import structlog
+
+logger = structlog.get_logger()  # module-level structlog logger used by the code below
+
+
+class EvidencePackGenerator:  # pylint: disable=too-few-public-methods
+    """Generate evidence packs with manifests and signatures"""
+
+    def __init__(self, storage_client: Any) -> None:
+        self.storage = storage_client
+
+    async def create_evidence_pack(  # pylint: disable=too-many-locals
+        self,
+        taxpayer_id: str,
+        tax_year: str,
+        scope: str,
+        evidence_items: list[dict[str, Any]],
+    ) -> dict[str, Any]:
+        """Create evidence pack with manifest and signatures"""
+        # pylint: disable=import-outside-toplevel
+        import hashlib
+        import json
+        import zipfile
+        from datetime import datetime
+
+        try:
+            # Create ZIP buffer
+            zip_buffer = io.BytesIO()
+
+            with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
+                manifest: dict[str, Any] = {
+                    "taxpayer_id": taxpayer_id,
+                    "tax_year": tax_year,
+                    "scope": scope,
+                    "created_at": datetime.utcnow().isoformat(),
+                    "evidence_items": [],
+                    "signatures": {},
+                }
+
+                # Add evidence files to ZIP
+                for item in evidence_items:
+                    doc_id = item["doc_id"]
+                    page = item.get("page")
+                    bbox = item.get("bbox")
+                    text_hash = item.get("text_hash")
+
+                    # Get document content
+                    doc_content = await self.storage.get_object(
+                        bucket_name="raw-documents",
+                        object_name=f"tenants/{taxpayer_id}/raw/{doc_id}.pdf",
+                    )
+
+                    if doc_content:
+                        # Add to ZIP
+                        zip_filename = f"documents/{doc_id}.pdf"
+                        zip_file.writestr(zip_filename, doc_content)
+
+                        # Calculate file hash
+                        file_hash = hashlib.sha256(doc_content).hexdigest()
+
+                        # Add to manifest
+                        manifest["evidence_items"].append(
+                            {
+                                "doc_id": doc_id,
+                                "filename": zip_filename,
+                                "page": page,
+                                "bbox": bbox,
+                                "text_hash": text_hash,
+                                "file_hash": file_hash,
+                                "file_size": len(doc_content),
+                            }
+                        )
+
+                # Sign manifest
+                manifest_json = json.dumps(manifest, indent=2, sort_keys=True)
+                manifest_hash = hashlib.sha256(manifest_json.encode()).hexdigest()
+
+                manifest["signatures"]["manifest_hash"] = manifest_hash
+                manifest["signatures"]["algorithm"] = "SHA-256"
+
+                # Add manifest to ZIP
+                zip_file.writestr("manifest.json", json.dumps(manifest, indent=2))
+
+            # Get ZIP content
+            zip_content = zip_buffer.getvalue()
+
+            # Store evidence pack
+            pack_filename = f"evidence_pack_{taxpayer_id}_{tax_year}_{scope}.zip"
+            pack_key = f"tenants/{taxpayer_id}/evidence_packs/{pack_filename}"
+
+            success = await self.storage.put_object(
+                bucket_name="evidence-packs",
+                object_name=pack_key,
+                data=io.BytesIO(zip_content),
+                length=len(zip_content),
+                content_type="application/zip",
+            )
+
+            if success:
+                return {
+                    "pack_filename": pack_filename,
+                    "pack_key": pack_key,
+                    "pack_size": len(zip_content),
+                    "evidence_count": len(evidence_items),
+                    "manifest_hash": manifest_hash,
+                    "s3_url": f"s3://evidence-packs/{pack_key}",
+                }
+            raise RuntimeError("Failed to store evidence pack")
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.error("Failed to create evidence pack", error=str(e))
+            raise
+
+
+# Form configuration for UK tax forms: form id -> display name, PDF template path and a "boxes" map of box number -> description/type
+UK_TAX_FORMS = {  # keyed by form id; box "type" values used below are "text", "date" and "currency"
+    "SA100": {
+        "name": "Self Assessment Tax Return",
+        "template_path": "forms/templates/SA100.pdf",
+        "boxes": {  # box numbers are string keys
+            "1": {"description": "Your name", "type": "text"},
+            "2": {"description": "Your address", "type": "text"},
+            "3": {"description": "Your UTR", "type": "text"},
+            "4": {"description": "Your NI number", "type": "text"},
+        },
+    },
+    "SA103": {
+        "name": "Self-employment (full)",
+        "template_path": "forms/templates/SA103.pdf",
+        "boxes": {  # only a subset of box numbers is configured (1-4, 20, 31-35)
+            "1": {"description": "Business name", "type": "text"},
+            "2": {"description": "Business description", "type": "text"},
+            "3": {"description": "Accounting period start", "type": "date"},
+            "4": {"description": "Accounting period end", "type": "date"},
+            "20": {"description": "Total turnover", "type": "currency"},
+            "31": {
+                "description": "Total allowable business expenses",
+                "type": "currency",
+            },
+            "32": {"description": "Net profit", "type": "currency"},
+            "33": {"description": "Balancing charges", "type": "currency"},
+            "34": {"description": "Goods/services for own use", "type": "currency"},
+            "35": {"description": "Total taxable profits", "type": "currency"},
+        },
+    },
+    "SA105": {
+        "name": "Property income",
+        "template_path": "forms/templates/SA105.pdf",
+        "boxes": {  # every configured box on this form is a currency amount
+            "20": {"description": "Total rents and other income", "type": "currency"},
+            "29": {
+                "description": "Premiums for the grant of a lease",
+                "type": "currency",
+            },
+            "31": {
+                "description": "Rent, rates, insurance, ground rents etc",
+                "type": "currency",
+            },
+            "32": {"description": "Property management", "type": "currency"},
+            "33": {
+                "description": "Services provided, including wages",
+                "type": "currency",
+            },
+            "34": {
+                "description": "Repairs, maintenance and renewals",
+                "type": "currency",
+            },
+            "35": {
+                "description": "Finance costs, including interest",
+                "type": "currency",
+            },
+            "36": {"description": "Professional fees", "type": "currency"},
+            "37": {"description": "Costs of services provided", "type": "currency"},
+            "38": {
+                "description": "Other allowable property expenses",
+                "type": "currency",
+            },
+        },
+    },
+}
--- a/libs/forms/pdf_filler.py
+++ b/libs/forms/pdf_filler.py
@@ -0,0 +1,246 @@
+"""PDF form filling using pdfrw with reportlab fallback."""
+
+import io
+from typing import Any
+
+import structlog
+
+logger = structlog.get_logger()  # module-level structlog logger used by PDFFormFiller
+
+
+class PDFFormFiller:
+    """PDF form filling using pdfrw with reportlab fallback"""
+
+    def __init__(self) -> None:
+        self.form_templates: dict[str, Any] = {}
+
+    def load_template(self, form_id: str, template_path: str) -> bool:
+        """Load PDF form template"""
+        try:
+            # pylint: disable=import-outside-toplevel
+            from pdfrw import PdfReader  # type: ignore
+
+            template = PdfReader(template_path)
+            if template is None:
+                logger.error(
+                    "Failed to load PDF template", form_id=form_id, path=template_path
+                )
+                return False
+
+            self.form_templates[form_id] = {"template": template, "path": template_path}
+
+            logger.info("Loaded PDF template", form_id=form_id, path=template_path)
+            return True
+
+        except ImportError:
+            logger.error("pdfrw not available for PDF form filling")
+            return False
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.error("Failed to load PDF template", form_id=form_id, error=str(e))
+            return False
+
+    def fill_form(
+        self,
+        form_id: str,
+        field_values: dict[str, str | int | float | bool],
+        output_path: str | None = None,
+    ) -> bytes | None:
+        """Fill PDF form with values"""
+
+        if form_id not in self.form_templates:
+            logger.error("Form template not loaded", form_id=form_id)
+            return None
+
+        try:
+            return self._fill_with_pdfrw(form_id, field_values, output_path)
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.warning(
+                "pdfrw filling failed, trying reportlab overlay", error=str(e)
+            )
+            return self._fill_with_overlay(form_id, field_values, output_path)
+
+    def _fill_with_pdfrw(
+        self,
+        form_id: str,
+        field_values: dict[str, Any],
+        output_path: str | None = None,
+    ) -> bytes | None:
+        """Fill form using pdfrw"""
+        # pylint: disable=import-outside-toplevel
+        from pdfrw import PdfDict, PdfReader, PdfWriter
+
+        template_info = self.form_templates[form_id]
+        template = PdfReader(template_info["path"])
+
+        # Get form fields
+        if template.Root.AcroForm is None:  # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
+            logger.warning("PDF has no AcroForm fields", form_id=form_id)
+            return self._fill_with_overlay(form_id, field_values, output_path)
+
+        # Fill form fields
+        for field in template.Root.AcroForm.Fields:  # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
+            field_name = field.T
+            if field_name and field_name[1:-1] in field_values:  # Remove parentheses
+                field_value = field_values[field_name[1:-1]]
+
+                # Set field value
+                if isinstance(field_value, bool):
+                    # Checkbox field
+                    if field_value:
+                        field.V = PdfDict.Yes  # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
+                        field.AS = PdfDict.Yes  # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
+                    else:
+                        field.V = PdfDict.Off  # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
+                        field.AS = PdfDict.Off  # fmt: skip # pyright: ignore[reportAttributeAccessIssue]
+                else:
+                    # Text field
+                    field.V = str(field_value)
+
+                # Make field read-only
+                field.Ff = 1  # Read-only flag
+
+        # Flatten form (make fields non-editable)
+        if template.Root.AcroForm:  # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
+            template.Root.AcroForm.NeedAppearances = True  # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue] # fmt: skip
+
+        # Write to output
+        if output_path:
+            writer = PdfWriter(output_path)
+            writer.write(template)
+            with open(output_path, "rb") as f:
+                return f.read()
+        else:
+            # Write to bytes
+            output_buffer = io.BytesIO()
+            writer = PdfWriter(output_buffer)
+            writer.write(template)
+            return output_buffer.getvalue()
+
+    def _fill_with_overlay(  # pylint: disable=too-many-locals
+        self,
+        form_id: str,
+        field_values: dict[str, Any],
+        output_path: str | None = None,
+    ) -> bytes | None:
+        """Fill form using reportlab overlay method"""
+        try:
+            # pylint: disable=import-outside-toplevel
+            from PyPDF2 import PdfReader, PdfWriter
+            from reportlab.lib.pagesizes import A4
+            from reportlab.pdfgen import canvas
+
+            template_info = self.form_templates[form_id]
+
+            # Read original PDF
+            original_pdf = PdfReader(template_info["path"])
+
+            # Create overlay with form data
+            overlay_buffer = io.BytesIO()
+            overlay_canvas = canvas.Canvas(overlay_buffer, pagesize=A4)
+
+            # Get field positions (this would be configured per form)
+            field_positions = self._get_field_positions(form_id)
+
+            # Add text to overlay
+            for field_name, value in field_values.items():
+                if field_name in field_positions:
+                    pos = field_positions[field_name]
+                    overlay_canvas.drawString(pos["x"], pos["y"], str(value))
+
+            overlay_canvas.save()
+            overlay_buffer.seek(0)
+
+            # Read overlay PDF
+            overlay_pdf = PdfReader(overlay_buffer)
+
+            # Merge original and overlay
+            writer = PdfWriter()
+            for page_num, _ in enumerate(original_pdf.pages):
+                original_page = original_pdf.pages[page_num]
+
+                if page_num < len(overlay_pdf.pages):
+                    overlay_page = overlay_pdf.pages[page_num]
+                    original_page.merge_page(overlay_page)
+
+                writer.add_page(original_page)
+
+            # Write result
+            if output_path:
+                with open(output_path, "wb") as output_file:
+                    writer.write(output_file)
+                with open(output_path, "rb") as f:
+                    return f.read()
+            else:
+                output_buffer = io.BytesIO()
+                writer.write(output_buffer)
+                return output_buffer.getvalue()
+
+        except ImportError as e:
+            logger.error(
+                "Required libraries not available for overlay method", error=str(e)
+            )
+            return None
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.error("Overlay filling failed", form_id=form_id, error=str(e))
+            return None
+
+    def _get_field_positions(self, form_id: str) -> dict[str, dict[str, float]]:
+        """Get field positions for overlay method"""
+        # This would be configured per form type
+        # For now, return sample positions for SA103
+        if form_id == "SA103":
+            return {
+                "box_1": {"x": 100, "y": 750},  # Business name
+                "box_2": {"x": 100, "y": 720},  # Business description
+                "box_20": {"x": 400, "y": 600},  # Total turnover
+                "box_31": {"x": 400, "y": 570},  # Total expenses
+                "box_32": {"x": 400, "y": 540},  # Net profit
+            }
+        return {}
+
+    def get_form_fields(self, form_id: str) -> list[dict[str, Any]]:
+        """Get list of available form fields"""
+        if form_id not in self.form_templates:
+            return []
+
+        try:
+            # pylint: disable=import-outside-toplevel
+            from pdfrw import PdfReader
+
+            template_info = self.form_templates[form_id]
+            template = PdfReader(template_info["path"])
+
+            if template.Root.AcroForm is None:  # fmt: skip # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue]
+                return []
+
+            fields = []
+            for field in template.Root.AcroForm.Fields:  # fmt: skip # pyright: ignore[reportOptionalMemberAccess, reportAttributeAccessIssue]
+                field_info = {
+                    "name": field.T[1:-1] if field.T else None,  # Remove parentheses
+                    "type": self._get_field_type(field),
+                    "required": bool(field.Ff and int(field.Ff) & 2),  # Required flag
+                    "readonly": bool(field.Ff and int(field.Ff) & 1),  # Read-only flag
+                }
+
+                if field.V:
+                    field_info["default_value"] = str(field.V)
+
+                fields.append(field_info)
+
+            return fields
+
+        except Exception as e:  # pylint: disable=broad-exception-caught
+            logger.error("Failed to get form fields", form_id=form_id, error=str(e))
+            return []
+
+    def _get_field_type(self, field: Any) -> str:
+        """Determine field type from PDF field"""
+        if hasattr(field, "FT"):
+            field_type = str(field.FT)
+            if "Tx" in field_type:
+                return "text"
+            if "Btn" in field_type:
+                return "checkbox" if field.Ff and int(field.Ff) & 32768 else "button"
+            if "Ch" in field_type:
+                return "choice"
+        return "unknown"