Files
ai-tax-agent/libs/schemas/events.py
harkon fdba81809f
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
completed local setup with compose
2025-11-26 13:17:17 +00:00

310 lines
12 KiB
Python

"""Typed event payload schemas for validation and type safety."""
from typing import Any, Literal
from pydantic import BaseModel, ConfigDict, Field, field_validator
# Base schema for all events
class BaseEventData(BaseModel):
    """Common ancestor for every typed event payload.

    Subclasses inherit a strict configuration: unknown fields are
    rejected at validation time and instances are immutable once built.
    """

    model_config = ConfigDict(
        # Reject unexpected fields instead of silently keeping them.
        extra="forbid",
        # Instances are read-only after construction.
        frozen=True,
    )
# Document lifecycle events
class DocumentIngestedEventData(BaseEventData):
    """Payload published once a document has been ingested and stored."""

    # Identity and file attributes
    doc_id: str = Field(..., description="Unique document identifier (ULID)")
    filename: str = Field(..., description="Original filename")
    mime_type: str = Field(..., description="MIME type of the document")
    size_bytes: int = Field(..., ge=0, description="File size in bytes")
    checksum_sha256: str = Field(..., description="SHA-256 checksum for integrity")
    # Classification and provenance
    kind: str = Field(..., description="Document kind (invoice, receipt, bank_statement, etc.)")
    source: str = Field(..., description="Ingestion source (manual_upload, rpa, email, api)")
    # Storage location and free-form extras
    storage_path: str = Field(..., description="MinIO object storage path")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    @field_validator("checksum_sha256")
    @classmethod
    def validate_checksum(cls, v: str) -> str:
        """Normalise the checksum to lowercase, rejecting malformed values."""
        normalized = v.lower()
        # A SHA-256 digest is exactly 64 lowercase hex characters.
        if len(normalized) != 64 or set(normalized) - set("0123456789abcdef"):
            raise ValueError("Invalid SHA-256 checksum format")
        return normalized
class DocumentOCRReadyEventData(BaseEventData):
    """Payload published after OCR has finished for a document."""

    doc_id: str = Field(..., description="Document identifier")
    ocr_engine: Literal["tesseract", "textract", "azure_ocr"] = Field(..., description="OCR engine used")
    page_count: int = Field(..., ge=1, description="Number of pages processed")
    confidence_avg: float = Field(..., ge=0.0, le=1.0, description="Average OCR confidence score")
    text_length: int = Field(..., ge=0, description="Total extracted text length")
    layout_detected: bool = Field(..., description="Whether document layout was successfully detected")
    languages_detected: list[str] = Field(default_factory=list, description="Detected languages (ISO 639-1 codes)")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to OCR results in storage")
class DocumentExtractedEventData(BaseEventData):
    """Payload published after field extraction finishes for a document."""

    doc_id: str = Field(..., description="Document identifier")
    extraction_id: str = Field(..., description="Unique extraction run identifier")
    strategy: Literal["llm", "rules", "hybrid"] = Field(..., description="Extraction strategy used")
    fields_extracted: int = Field(..., ge=0, description="Number of fields extracted")
    # Raw vs. calibrated confidence are reported separately.
    confidence_avg: float = Field(..., ge=0.0, le=1.0, description="Average extraction confidence")
    calibrated_confidence: float = Field(..., ge=0.0, le=1.0, description="Calibrated confidence score")
    model_name: str | None = Field(None, description="LLM model used (if applicable)")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to extraction results")
# Knowledge Graph events
class KGUpsertReadyEventData(BaseEventData):
    """Payload signalling that normalized data is ready for a KG upsert."""

    doc_id: str = Field(..., description="Source document identifier")
    entity_count: int = Field(..., ge=0, description="Number of entities to upsert")
    relationship_count: int = Field(..., ge=0, description="Number of relationships to upsert")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    normalization_id: str = Field(..., description="Normalization run identifier")
    storage_path: str = Field(..., description="Path to normalized data")
class KGUpsertedEventData(BaseEventData):
    """Payload published once a knowledge-graph upsert has run."""

    doc_id: str = Field(..., description="Source document identifier")
    # Created/updated counts, split by graph element type.
    entities_created: int = Field(..., ge=0, description="Entities created")
    entities_updated: int = Field(..., ge=0, description="Entities updated")
    relationships_created: int = Field(..., ge=0, description="Relationships created")
    relationships_updated: int = Field(..., ge=0, description="Relationships updated")
    shacl_violations: int = Field(..., ge=0, description="Number of SHACL validation violations")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    # Outcome flag plus optional failure detail.
    success: bool = Field(..., description="Whether upsert was successful")
    error_message: str | None = Field(None, description="Error message if failed")
# RAG events
class RAGIndexedEventData(BaseEventData):
    """Payload published once a document has been chunked and indexed for RAG."""

    doc_id: str = Field(..., description="Source document identifier")
    collection_name: str = Field(..., description="Qdrant collection name")
    chunks_indexed: int = Field(..., ge=0, description="Number of chunks indexed")
    embedding_model: str = Field(..., description="Embedding model used")
    # PII handling outcome for the indexed content.
    pii_detected: bool = Field(..., description="Whether PII was detected")
    pii_redacted: bool = Field(..., description="Whether PII was redacted")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to chunked data")
# Calculation events
class CalculationReadyEventData(BaseEventData):
    """Payload published once a tax calculation run has completed."""

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    schedule_id: str = Field(..., description="Tax schedule identifier (SA102, SA103)")
    calculation_id: str = Field(..., description="Unique calculation run identifier")
    boxes_computed: int = Field(..., ge=0, description="Number of form boxes computed")
    # Totals may be absent when not applicable to the schedule.
    total_income: float | None = Field(None, description="Total income calculated")
    total_tax: float | None = Field(None, description="Total tax calculated")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Calculation confidence score")
    evidence_count: int = Field(..., ge=0, description="Number of evidence items supporting calculation")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(..., description="Path to calculation results")
# Form events
class FormFilledEventData(BaseEventData):
    """Event emitted when PDF form filling is complete.

    The ``checksum_sha256`` field is validated and normalised the same way
    as on ``DocumentIngestedEventData`` so integrity checksums are stored
    consistently (lowercase, 64 hex characters) across all event types.
    """

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    form_id: str = Field(..., description="Form identifier (SA100, SA102, etc.)")
    fields_filled: int = Field(..., ge=0, description="Number of fields filled")
    pdf_size_bytes: int = Field(..., ge=0, description="Generated PDF size in bytes")
    storage_path: str = Field(..., description="Path to filled PDF")
    evidence_bundle_path: str | None = Field(
        None, description="Path to evidence bundle ZIP"
    )
    checksum_sha256: str = Field(..., description="PDF checksum for integrity")

    @field_validator("checksum_sha256")
    @classmethod
    def validate_checksum(cls, v: str) -> str:
        """Validate SHA-256 checksum format (mirrors DocumentIngestedEventData)."""
        if len(v) != 64 or not all(c in "0123456789abcdef" for c in v.lower()):
            raise ValueError("Invalid SHA-256 checksum format")
        return v.lower()
# HMRC events
class HMRCSubmittedEventData(BaseEventData):
    """Payload published after an HMRC submission attempt finishes."""

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    submission_id: str = Field(..., description="Unique submission identifier")
    # Reference is only present when HMRC returned one.
    hmrc_reference: str | None = Field(None, description="HMRC submission reference")
    submission_type: Literal["dry_run", "sandbox", "live"] = Field(..., description="Submission environment type")
    # Outcome details.
    success: bool = Field(..., description="Whether submission was successful")
    status_code: int | None = Field(None, description="HTTP status code")
    error_message: str | None = Field(None, description="Error message if failed")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
# Review events
class ReviewRequestedEventData(BaseEventData):
    """Payload published when a human review is requested for a document."""

    doc_id: str = Field(..., description="Document identifier")
    review_type: Literal["extraction", "calculation", "submission"] = Field(..., description="Type of review needed")
    priority: Literal["low", "medium", "high", "urgent"] = Field(..., description="Review priority level")
    reason: str = Field(..., description="Reason for review request")
    # Assignment details are optional at request time.
    assigned_to: str | None = Field(None, description="User assigned to review")
    due_date: str | None = Field(None, description="Review due date (ISO 8601)")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional review metadata")
class ReviewCompletedEventData(BaseEventData):
    """Payload published once a human review session has been completed."""

    doc_id: str = Field(..., description="Document identifier")
    review_id: str = Field(..., description="Review session identifier")
    reviewer: str = Field(..., description="User who completed review")
    decision: Literal["approved", "rejected", "needs_revision"] = Field(..., description="Review decision")
    changes_made: int = Field(..., ge=0, description="Number of changes made")
    comments: str | None = Field(None, description="Reviewer comments")
    review_duration_seconds: int = Field(..., ge=0, description="Time spent in review (seconds)")
# Firm sync events
class FirmSyncCompletedEventData(BaseEventData):
    """Payload published once a firm-database synchronisation run finishes."""

    firm_id: str = Field(..., description="Firm identifier")
    connector_type: str = Field(..., description="Connector type (iris, sage, xero, etc.)")
    sync_id: str = Field(..., description="Unique sync run identifier")
    # Per-record outcome counters.
    records_synced: int = Field(..., ge=0, description="Number of records synced")
    records_created: int = Field(..., ge=0, description="Records created")
    records_updated: int = Field(..., ge=0, description="Records updated")
    records_failed: int = Field(..., ge=0, description="Records that failed to sync")
    # Overall outcome plus optional failure detail.
    success: bool = Field(..., description="Whether sync was successful")
    error_message: str | None = Field(None, description="Error message if failed")
    processing_time_ms: int = Field(..., ge=0, description="Processing time in milliseconds")
# Schema mapping for topic -> data class
EVENT_SCHEMA_MAP: dict[str, type[BaseEventData]] = {
    # Document lifecycle
    "doc.ingested": DocumentIngestedEventData,
    "doc.ocr_ready": DocumentOCRReadyEventData,
    "doc.extracted": DocumentExtractedEventData,
    # Knowledge graph
    "kg.upsert.ready": KGUpsertReadyEventData,
    "kg.upserted": KGUpsertedEventData,
    # RAG indexing
    "rag.indexed": RAGIndexedEventData,
    # Calculation and form filling
    "calc.schedule_ready": CalculationReadyEventData,
    "form.filled": FormFilledEventData,
    # HMRC submission
    "hmrc.submitted": HMRCSubmittedEventData,
    # Human review
    "review.requested": ReviewRequestedEventData,
    "review.completed": ReviewCompletedEventData,
    # Firm connectors
    "firm.sync.completed": FirmSyncCompletedEventData,
}
def validate_event_data(topic: str, data: dict[str, Any]) -> BaseEventData:
    """
    Validate event data against the schema for the given topic.

    Args:
        topic: Event topic name (must be a key of EVENT_SCHEMA_MAP)
        data: Raw event data dictionary

    Returns:
        Validated event data model

    Raises:
        ValueError: If topic is unknown or validation fails
    """
    # EAFP: single dict lookup instead of membership test plus subscript.
    try:
        schema_class = EVENT_SCHEMA_MAP[topic]
    except KeyError:
        raise ValueError(f"Unknown event topic: {topic}") from None
    return schema_class.model_validate(data)
def get_schema_for_topic(topic: str) -> type[BaseEventData]:
    """
    Get the Pydantic schema class for a given topic.

    Args:
        topic: Event topic name

    Returns:
        Schema class for the topic

    Raises:
        ValueError: If topic is unknown
    """
    # EAFP: single dict lookup instead of membership test plus subscript.
    try:
        return EVENT_SCHEMA_MAP[topic]
    except KeyError:
        raise ValueError(f"Unknown event topic: {topic}") from None