Files
ai-tax-agent/libs/schemas/events.py
harkon fdba81809f
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
completed local setup with compose
2025-11-26 13:17:17 +00:00

310 lines
12 KiB
Python

"""Typed event payload schemas for validation and type safety."""
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
# Base schema for all events
class BaseEventData(BaseModel):
    """Common ancestor for every typed event payload.

    Subclasses inherit a strict configuration: unknown fields are
    rejected at validation time and instances are immutable once built.
    """

    model_config = ConfigDict(
        # Reject unexpected fields instead of silently keeping them.
        extra="forbid",
        # Instances are read-only after construction.
        frozen=True,
    )
# Document lifecycle events
class DocumentIngestedEventData(BaseEventData):
    """Payload published once a document has been ingested and stored."""

    # Identity and file attributes
    doc_id: str = Field(..., description="Unique document identifier (ULID)")
    filename: str = Field(..., description="Original filename")
    mime_type: str = Field(..., description="MIME type of the document")
    size_bytes: int = Field(..., ge=0, description="File size in bytes")
    checksum_sha256: str = Field(..., description="SHA-256 checksum for integrity")
    # Classification and provenance
    kind: str = Field(..., description="Document kind (invoice, receipt, bank_statement, etc.)")
    source: str = Field(..., description="Ingestion source (manual_upload, rpa, email, api)")
    # Storage location and free-form extras
    storage_path: str = Field(..., description="MinIO object storage path")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    @field_validator("checksum_sha256")
    @classmethod
    def validate_checksum(cls, v: str) -> str:
        """Normalise the checksum to lowercase, rejecting malformed values."""
        normalized = v.lower()
        # A SHA-256 digest is exactly 64 lowercase hex characters.
        if len(normalized) != 64 or set(normalized) - set("0123456789abcdef"):
            raise ValueError("Invalid SHA-256 checksum format")
        return normalized
class DocumentOCRReadyEventData(BaseEventData):
    """Payload published after OCR has finished for a document."""

    doc_id: str = Field(..., description="Document identifier")
    ocr_engine: Literal["tesseract", "textract", "azure_ocr"] = Field(..., description="OCR engine used")
    page_count: int = Field(..., ge=1, description="Number of pages processed")
    confidence_avg: float = Field(..., ge=0.0, le=1.0, description="Average OCR confidence score")
    text_length: int = Field(..., ge=0, description="Total extracted text length")
    layout_detected: bool = Field(..., description="Whether document layout was successfully detected")
    languages_detected: list[str] = Field(default_factory=list, description="Detected languages (ISO 639-1 codes)")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to OCR results in storage")
class DocumentExtractedEventData(BaseEventData):
    """Payload published after field extraction finishes for a document."""

    doc_id: str = Field(..., description="Document identifier")
    extraction_id: str = Field(..., description="Unique extraction run identifier")
    strategy: Literal["llm", "rules", "hybrid"] = Field(..., description="Extraction strategy used")
    fields_extracted: int = Field(..., ge=0, description="Number of fields extracted")
    # Raw vs. calibrated confidence are reported separately.
    confidence_avg: float = Field(..., ge=0.0, le=1.0, description="Average extraction confidence")
    calibrated_confidence: float = Field(..., ge=0.0, le=1.0, description="Calibrated confidence score")
    model_name: str | None = Field(None, description="LLM model used (if applicable)")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to extraction results")
# Knowledge Graph events
class KGUpsertReadyEventData(BaseEventData):
    """Payload signalling that normalized data is ready for a KG upsert."""

    doc_id: str = Field(..., description="Source document identifier")
    entity_count: int = Field(..., ge=0, description="Number of entities to upsert")
    relationship_count: int = Field(..., ge=0, description="Number of relationships to upsert")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    normalization_id: str = Field(..., description="Normalization run identifier")
    storage_path: str = Field(..., description="Path to normalized data")
class KGUpsertedEventData(BaseEventData):
    """Payload published once a knowledge-graph upsert has run."""

    doc_id: str = Field(..., description="Source document identifier")
    # Created/updated counts, split by graph element type.
    entities_created: int = Field(..., ge=0, description="Entities created")
    entities_updated: int = Field(..., ge=0, description="Entities updated")
    relationships_created: int = Field(..., ge=0, description="Relationships created")
    relationships_updated: int = Field(..., ge=0, description="Relationships updated")
    shacl_violations: int = Field(..., ge=0, description="Number of SHACL validation violations")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    # Outcome flag plus optional failure detail.
    success: bool = Field(..., description="Whether upsert was successful")
    error_message: str | None = Field(None, description="Error message if failed")
# RAG events
class RAGIndexedEventData(BaseEventData):
    """Payload published once a document has been chunked and indexed for RAG."""

    doc_id: str = Field(..., description="Source document identifier")
    collection_name: str = Field(..., description="Qdrant collection name")
    chunks_indexed: int = Field(..., ge=0, description="Number of chunks indexed")
    embedding_model: str = Field(..., description="Embedding model used")
    # PII handling outcome for the indexed content.
    pii_detected: bool = Field(..., description="Whether PII was detected")
    pii_redacted: bool = Field(..., description="Whether PII was redacted")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to chunked data")
# Calculation events
class CalculationReadyEventData(BaseEventData):
    """Payload published once a tax calculation run has completed."""

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    schedule_id: str = Field(..., description="Tax schedule identifier (SA102, SA103)")
    calculation_id: str = Field(..., description="Unique calculation run identifier")
    boxes_computed: int = Field(..., ge=0, description="Number of form boxes computed")
    # Totals may be absent when not applicable to the schedule.
    total_income: float | None = Field(None, description="Total income calculated")
    total_tax: float | None = Field(None, description="Total tax calculated")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Calculation confidence score")
    evidence_count: int = Field(..., ge=0, description="Number of evidence items supporting calculation")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to calculation results")
# Form events
class FormFilledEventData(BaseEventData):
    """Event emitted when PDF form filling is complete.

    The ``checksum_sha256`` field is validated and normalised the same way
    as on ``DocumentIngestedEventData`` so integrity checksums are stored
    consistently (lowercase, 64 hex characters) across all event types.
    """

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    form_id: str = Field(..., description="Form identifier (SA100, SA102, etc.)")
    fields_filled: int = Field(..., ge=0, description="Number of fields filled")
    pdf_size_bytes: int = Field(..., ge=0, description="Generated PDF size in bytes")
    storage_path: str = Field(..., description="Path to filled PDF")
    evidence_bundle_path: str | None = Field(
        None, description="Path to evidence bundle ZIP"
    )
    checksum_sha256: str = Field(..., description="PDF checksum for integrity")

    @field_validator("checksum_sha256")
    @classmethod
    def validate_checksum(cls, v: str) -> str:
        """Validate SHA-256 checksum format (mirrors DocumentIngestedEventData)."""
        if len(v) != 64 or not all(c in "0123456789abcdef" for c in v.lower()):
            raise ValueError("Invalid SHA-256 checksum format")
        return v.lower()
# HMRC events
class HMRCSubmittedEventData(BaseEventData):
    """Payload published after an HMRC submission attempt finishes."""

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    submission_id: str = Field(..., description="Unique submission identifier")
    # Reference is only present when HMRC returned one.
    hmrc_reference: str | None = Field(None, description="HMRC submission reference")
    submission_type: Literal["dry_run", "sandbox", "live"] = Field(..., description="Submission environment type")
    # Outcome details.
    success: bool = Field(..., description="Whether submission was successful")
    status_code: int | None = Field(None, description="HTTP status code")
    error_message: str | None = Field(None, description="Error message if failed")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
# Review events
class ReviewRequestedEventData(BaseEventData):
    """Payload published when a human review is requested for a document."""

    doc_id: str = Field(..., description="Document identifier")
    review_type: Literal["extraction", "calculation", "submission"] = Field(..., description="Type of review needed")
    priority: Literal["low", "medium", "high", "urgent"] = Field(..., description="Review priority level")
    reason: str = Field(..., description="Reason for review request")
    # Assignment details are optional at request time.
    assigned_to: str | None = Field(None, description="User assigned to review")
    due_date: str | None = Field(None, description="Review due date (ISO 8601)")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional review metadata")
class ReviewCompletedEventData(BaseEventData):
    """Payload published once a human review session has been completed."""

    doc_id: str = Field(..., description="Document identifier")
    review_id: str = Field(..., description="Review session identifier")
    reviewer: str = Field(..., description="User who completed review")
    decision: Literal["approved", "rejected", "needs_revision"] = Field(..., description="Review decision")
    changes_made: int = Field(..., ge=0, description="Number of changes made")
    comments: str | None = Field(None, description="Reviewer comments")
    review_duration_seconds: int = Field(..., ge=0, description="Time spent in review (seconds)")
# Firm sync events
class FirmSyncCompletedEventData(BaseEventData):
    """Payload published once a firm-database synchronisation run finishes."""

    firm_id: str = Field(..., description="Firm identifier")
    connector_type: str = Field(..., description="Connector type (iris, sage, xero, etc.)")
    sync_id: str = Field(..., description="Unique sync run identifier")
    # Per-record outcome counters.
    records_synced: int = Field(..., ge=0, description="Number of records synced")
    records_created: int = Field(..., ge=0, description="Records created")
    records_updated: int = Field(..., ge=0, description="Records updated")
    records_failed: int = Field(..., ge=0, description="Records that failed to sync")
    # Overall outcome plus optional failure detail.
    success: bool = Field(..., description="Whether sync was successful")
    error_message: str | None = Field(None, description="Error message if failed")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
# Schema mapping for topic -> data class
EVENT_SCHEMA_MAP: dict[str, type[BaseEventData]] = {
    # Document lifecycle
    "doc.ingested": DocumentIngestedEventData,
    "doc.ocr_ready": DocumentOCRReadyEventData,
    "doc.extracted": DocumentExtractedEventData,
    # Knowledge graph
    "kg.upsert.ready": KGUpsertReadyEventData,
    "kg.upserted": KGUpsertedEventData,
    # RAG indexing
    "rag.indexed": RAGIndexedEventData,
    # Calculation and form filling
    "calc.schedule_ready": CalculationReadyEventData,
    "form.filled": FormFilledEventData,
    # HMRC submission
    "hmrc.submitted": HMRCSubmittedEventData,
    # Human review
    "review.requested": ReviewRequestedEventData,
    "review.completed": ReviewCompletedEventData,
    # Firm connectors
    "firm.sync.completed": FirmSyncCompletedEventData,
}
def validate_event_data(topic: str, data: dict[str, Any]) -> BaseEventData:
    """
    Validate event data against the schema for the given topic.

    Args:
        topic: Event topic name (must be a key of EVENT_SCHEMA_MAP)
        data: Raw event data dictionary

    Returns:
        Validated event data model

    Raises:
        ValueError: If topic is unknown or validation fails
    """
    # EAFP: single dict lookup instead of membership test plus subscript.
    try:
        schema_class = EVENT_SCHEMA_MAP[topic]
    except KeyError:
        raise ValueError(f"Unknown event topic: {topic}") from None
    return schema_class.model_validate(data)
def get_schema_for_topic(topic: str) -> type[BaseEventData]:
    """
    Get the Pydantic schema class for a given topic.

    Args:
        topic: Event topic name

    Returns:
        Schema class for the topic

    Raises:
        ValueError: If topic is unknown
    """
    # EAFP: single dict lookup instead of membership test plus subscript.
    try:
        return EVENT_SCHEMA_MAP[topic]
    except KeyError:
        raise ValueError(f"Unknown event topic: {topic}") from None