"""Typed event payload schemas for validation and type safety.""" from typing import Any, Literal from pydantic import BaseModel, ConfigDict, Field, field_validator # Base schema for all events class BaseEventData(BaseModel): """Base class for all event data payloads.""" model_config = ConfigDict( extra="forbid", # Prevent unexpected fields frozen=True, # Make immutable ) # Document lifecycle events class DocumentIngestedEventData(BaseEventData): """Event emitted when a document is successfully ingested.""" doc_id: str = Field(..., description="Unique document identifier (ULID)") filename: str = Field(..., description="Original filename") mime_type: str = Field(..., description="MIME type of the document") size_bytes: int = Field(..., ge=0, description="File size in bytes") checksum_sha256: str = Field(..., description="SHA-256 checksum for integrity") kind: str = Field( ..., description="Document kind (invoice, receipt, bank_statement, etc.)" ) source: str = Field( ..., description="Ingestion source (manual_upload, rpa, email, api)" ) storage_path: str = Field(..., description="MinIO object storage path") metadata: dict[str, Any] = Field( default_factory=dict, description="Additional metadata" ) @field_validator("checksum_sha256") @classmethod def validate_checksum(cls, v: str) -> str: """Validate SHA-256 checksum format.""" if len(v) != 64 or not all(c in "0123456789abcdef" for c in v.lower()): raise ValueError("Invalid SHA-256 checksum format") return v.lower() class DocumentOCRReadyEventData(BaseEventData): """Event emitted when OCR processing is complete.""" doc_id: str = Field(..., description="Document identifier") ocr_engine: Literal["tesseract", "textract", "azure_ocr"] = Field( ..., description="OCR engine used" ) page_count: int = Field(..., ge=1, description="Number of pages processed") confidence_avg: float = Field( ..., ge=0.0, le=1.0, description="Average OCR confidence score" ) text_length: int = Field(..., ge=0, description="Total extracted text length") layout_detected: bool = Field( ..., description="Whether document layout was successfully detected" ) languages_detected: list[str] = Field( default_factory=list, description="Detected languages (ISO 639-1 codes)" ) processing_time_ms: int = Field( ..., ge=0, description="Processing time in milliseconds" ) storage_path: str = Field(..., description="Path to OCR results in storage") class DocumentExtractedEventData(BaseEventData): """Event emitted when field extraction is complete.""" doc_id: str = Field(..., description="Document identifier") tenant_id: str = Field(..., description="Tenant identifier") extraction_id: str = Field(..., description="Unique extraction run identifier") strategy: Literal["llm", "rules", "hybrid"] = Field( ..., description="Extraction strategy used" ) field_count: int = Field(..., ge=0, description="Number of fields extracted") confidence: float = Field( ..., ge=0.0, le=1.0, description="Extraction confidence score" ) extraction_results: dict[str, Any] = Field( ..., description="Full extraction results including provenance" ) model_name: str | None = Field(None, description="LLM model used (if applicable)") processing_time_ms: int | None = Field( None, ge=0, description="Processing time in milliseconds" ) storage_path: str | None = Field(None, description="Path to extraction results") # Knowledge Graph events class KGUpsertReadyEventData(BaseEventData): """Event emitted when KG upsert data is ready.""" doc_id: str = Field(..., description="Source document identifier") entity_count: int = Field(..., ge=0, description="Number of entities to upsert") relationship_count: int = Field( ..., ge=0, description="Number of relationships to upsert" ) tax_year: str = Field(..., description="Tax year (e.g., '2024-25')") taxpayer_id: str = Field(..., description="Taxpayer identifier") normalization_id: str = Field(..., description="Normalization run identifier") storage_path: str = Field(..., description="Path to normalized data") class KGUpsertedEventData(BaseEventData): """Event emitted when KG upsert is complete.""" doc_id: str = Field(..., description="Source document identifier") entities_created: int = Field(..., ge=0, description="Entities created") entities_updated: int = Field(..., ge=0, description="Entities updated") relationships_created: int = Field(..., ge=0, description="Relationships created") relationships_updated: int = Field(..., ge=0, description="Relationships updated") shacl_violations: int = Field( ..., ge=0, description="Number of SHACL validation violations" ) processing_time_ms: int = Field( ..., ge=0, description="Processing time in milliseconds" ) success: bool = Field(..., description="Whether upsert was successful") error_message: str | None = Field(None, description="Error message if failed") # RAG events class RAGIndexedEventData(BaseEventData): """Event emitted when RAG indexing is complete.""" doc_id: str = Field(..., description="Source document identifier") collection_name: str = Field(..., description="Qdrant collection name") chunks_indexed: int = Field(..., ge=0, description="Number of chunks indexed") embedding_model: str = Field(..., description="Embedding model used") pii_detected: bool = Field(..., description="Whether PII was detected") pii_redacted: bool = Field(..., description="Whether PII was redacted") processing_time_ms: int = Field( ..., ge=0, description="Processing time in milliseconds" ) storage_path: str = Field(..., description="Path to chunked data") # Calculation events class CalculationReadyEventData(BaseEventData): """Event emitted when tax calculation is complete.""" taxpayer_id: str = Field(..., description="Taxpayer identifier") tax_year: str = Field(..., description="Tax year (e.g., '2024-25')") schedule_id: str = Field(..., description="Tax schedule identifier (SA102, SA103)") calculation_id: str = Field(..., description="Unique calculation run identifier") boxes_computed: int = Field(..., ge=0, description="Number of form boxes computed") total_income: float | None = Field(None, description="Total income calculated") total_tax: float | None = Field(None, description="Total tax calculated") confidence: float = Field( ..., ge=0.0, le=1.0, description="Calculation confidence score" ) evidence_count: int = Field( ..., ge=0, description="Number of evidence items supporting calculation" ) processing_time_ms: int = Field( ..., ge=0, description="Processing time in milliseconds" ) storage_path: str = Field(..., description="Path to calculation results") # Form events class FormFilledEventData(BaseEventData): """Event emitted when PDF form filling is complete.""" taxpayer_id: str = Field(..., description="Taxpayer identifier") tax_year: str = Field(..., description="Tax year (e.g., '2024-25')") form_id: str = Field(..., description="Form identifier (SA100, SA102, etc.)") fields_filled: int = Field(..., ge=0, description="Number of fields filled") pdf_size_bytes: int = Field(..., ge=0, description="Generated PDF size in bytes") storage_path: str = Field(..., description="Path to filled PDF") evidence_bundle_path: str | None = Field( None, description="Path to evidence bundle ZIP" ) checksum_sha256: str = Field(..., description="PDF checksum for integrity") # HMRC events class HMRCSubmittedEventData(BaseEventData): """Event emitted when HMRC submission is complete.""" taxpayer_id: str = Field(..., description="Taxpayer identifier") tax_year: str = Field(..., description="Tax year (e.g., '2024-25')") submission_id: str = Field(..., description="Unique submission identifier") hmrc_reference: str | None = Field(None, description="HMRC submission reference") submission_type: Literal["dry_run", "sandbox", "live"] = Field( ..., description="Submission environment type" ) success: bool = Field(..., description="Whether submission was successful") status_code: int | None = Field(None, description="HTTP status code") error_message: str | None = Field(None, description="Error message if failed") processing_time_ms: int = Field( ..., ge=0, description="Processing time in milliseconds" ) # Review events class ReviewRequestedEventData(BaseEventData): """Event emitted when human review is requested.""" doc_id: str = Field(..., description="Document identifier") review_type: Literal["extraction", "calculation", "submission"] = Field( ..., description="Type of review needed" ) priority: Literal["low", "medium", "high", "urgent"] = Field( ..., description="Review priority level" ) reason: str = Field(..., description="Reason for review request") assigned_to: str | None = Field(None, description="User assigned to review") due_date: str | None = Field(None, description="Review due date (ISO 8601)") metadata: dict[str, Any] = Field( default_factory=dict, description="Additional review metadata" ) class ReviewCompletedEventData(BaseEventData): """Event emitted when human review is completed.""" doc_id: str = Field(..., description="Document identifier") review_id: str = Field(..., description="Review session identifier") reviewer: str = Field(..., description="User who completed review") decision: Literal["approved", "rejected", "needs_revision"] = Field( ..., description="Review decision" ) changes_made: int = Field(..., ge=0, description="Number of changes made") comments: str | None = Field(None, description="Reviewer comments") review_duration_seconds: int = Field( ..., ge=0, description="Time spent in review (seconds)" ) # Firm sync events class FirmSyncCompletedEventData(BaseEventData): """Event emitted when firm database sync is complete.""" firm_id: str = Field(..., description="Firm identifier") connector_type: str = Field( ..., description="Connector type (iris, sage, xero, etc.)" ) sync_id: str = Field(..., description="Unique sync run identifier") records_synced: int = Field(..., ge=0, description="Number of records synced") records_created: int = Field(..., ge=0, description="Records created") records_updated: int = Field(..., ge=0, description="Records updated") records_failed: int = Field(..., ge=0, description="Records that failed to sync") success: bool = Field(..., description="Whether sync was successful") error_message: str | None = Field(None, description="Error message if failed") processing_time_ms: int = Field( ..., ge=0, description="Processing time in milliseconds" ) # Schema mapping for topic -> data class EVENT_SCHEMA_MAP: dict[str, type[BaseEventData]] = { "doc.ingested": DocumentIngestedEventData, "doc.ocr_ready": DocumentOCRReadyEventData, "doc.extracted": DocumentExtractedEventData, "kg.upsert.ready": KGUpsertReadyEventData, "kg.upserted": KGUpsertedEventData, "rag.indexed": RAGIndexedEventData, "calc.schedule_ready": CalculationReadyEventData, "form.filled": FormFilledEventData, "hmrc.submitted": HMRCSubmittedEventData, "review.requested": ReviewRequestedEventData, "review.completed": ReviewCompletedEventData, "firm.sync.completed": FirmSyncCompletedEventData, } def validate_event_data(topic: str, data: dict[str, Any]) -> BaseEventData: """ Validate event data against the schema for the given topic. Args: topic: Event topic name data: Raw event data dictionary Returns: Validated event data model Raises: ValueError: If topic is unknown or validation fails """ if topic not in EVENT_SCHEMA_MAP: raise ValueError(f"Unknown event topic: {topic}") schema_class = EVENT_SCHEMA_MAP[topic] return schema_class.model_validate(data) def get_schema_for_topic(topic: str) -> type[BaseEventData]: """ Get the Pydantic schema class for a given topic. Args: topic: Event topic name Returns: Schema class for the topic Raises: ValueError: If topic is unknown """ if topic not in EVENT_SCHEMA_MAP: raise ValueError(f"Unknown event topic: {topic}") return EVENT_SCHEMA_MAP[topic]