Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
310 lines
12 KiB
Python
310 lines
12 KiB
Python
"""Typed event payload schemas for validation and type safety."""
|
|
|
|
from typing import Any, Literal
|
|
|
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
|
|
|
|
# Base schema for all events
|
|
class BaseEventData(BaseModel):
    """Base class for all event data payloads.

    Subclasses declare the typed fields for one event topic. The shared
    config makes every payload strict and immutable, so a published event
    cannot silently carry unknown fields or be mutated after validation.
    """

    model_config = ConfigDict(
        extra="forbid",  # Reject unexpected fields instead of silently accepting them
        frozen=True,  # Payloads are immutable once validated
    )
|
|
|
|
|
|
# Document lifecycle events
|
|
class DocumentIngestedEventData(BaseEventData):
    """Payload published once a document has been ingested into storage."""

    # Identity and raw-file properties captured at ingestion time.
    doc_id: str = Field(description="Unique document identifier (ULID)")
    filename: str = Field(description="Original filename")
    mime_type: str = Field(description="MIME type of the document")
    size_bytes: int = Field(ge=0, description="File size in bytes")
    checksum_sha256: str = Field(description="SHA-256 checksum for integrity")
    kind: str = Field(
        description="Document kind (invoice, receipt, bank_statement, etc.)"
    )
    source: str = Field(
        description="Ingestion source (manual_upload, rpa, email, api)"
    )
    storage_path: str = Field(description="MinIO object storage path")
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional metadata"
    )

    @field_validator("checksum_sha256")
    @classmethod
    def validate_checksum(cls, v: str) -> str:
        """Normalize the checksum to lowercase, rejecting malformed values."""
        normalized = v.lower()
        # A valid SHA-256 digest is exactly 64 lowercase hex characters.
        if len(normalized) == 64 and set(normalized) <= set("0123456789abcdef"):
            return normalized
        raise ValueError("Invalid SHA-256 checksum format")
|
|
|
|
|
|
class DocumentOCRReadyEventData(BaseEventData):
    """Payload published when OCR processing of a document has finished."""

    doc_id: str = Field(description="Document identifier")
    ocr_engine: Literal["tesseract", "textract", "azure_ocr"] = Field(
        description="OCR engine used"
    )
    page_count: int = Field(ge=1, description="Number of pages processed")
    confidence_avg: float = Field(
        ge=0.0, le=1.0, description="Average OCR confidence score"
    )
    text_length: int = Field(ge=0, description="Total extracted text length")
    layout_detected: bool = Field(
        description="Whether document layout was successfully detected"
    )
    languages_detected: list[str] = Field(
        default_factory=list, description="Detected languages (ISO 639-1 codes)"
    )
    processing_time_ms: int = Field(ge=0, description="Processing time in milliseconds")
    storage_path: str = Field(description="Path to OCR results in storage")
|
|
|
|
|
|
class DocumentExtractedEventData(BaseEventData):
    """Event emitted when field extraction is complete."""

    # "model_name" collides with pydantic v2's protected "model_" namespace and
    # triggers a UserWarning at class-creation time. Opting out here is merged
    # with BaseEventData's config, so extra="forbid" and frozen=True still apply.
    model_config = ConfigDict(protected_namespaces=())

    doc_id: str = Field(..., description="Document identifier")
    extraction_id: str = Field(..., description="Unique extraction run identifier")
    strategy: Literal["llm", "rules", "hybrid"] = Field(
        ..., description="Extraction strategy used"
    )
    fields_extracted: int = Field(..., ge=0, description="Number of fields extracted")
    # Raw vs. calibrated confidence; both constrained to [0.0, 1.0].
    confidence_avg: float = Field(
        ..., ge=0.0, le=1.0, description="Average extraction confidence"
    )
    calibrated_confidence: float = Field(
        ..., ge=0.0, le=1.0, description="Calibrated confidence score"
    )
    # Only populated when the strategy involved an LLM.
    model_name: str | None = Field(None, description="LLM model used (if applicable)")
    processing_time_ms: int = Field(
        ..., ge=0, description="Processing time in milliseconds"
    )
    storage_path: str = Field(..., description="Path to extraction results")
|
|
|
|
|
|
# Knowledge Graph events
|
|
class KGUpsertReadyEventData(BaseEventData):
    """Payload signalling that normalized data is staged for a KG upsert."""

    doc_id: str = Field(description="Source document identifier")
    entity_count: int = Field(ge=0, description="Number of entities to upsert")
    relationship_count: int = Field(
        ge=0, description="Number of relationships to upsert"
    )
    tax_year: str = Field(description="Tax year (e.g., '2024-25')")
    taxpayer_id: str = Field(description="Taxpayer identifier")
    normalization_id: str = Field(description="Normalization run identifier")
    storage_path: str = Field(description="Path to normalized data")
|
|
|
|
|
|
class KGUpsertedEventData(BaseEventData):
    """Event emitted when KG upsert is complete."""

    doc_id: str = Field(..., description="Source document identifier")
    # Created/updated counters for what was actually written to the graph.
    entities_created: int = Field(..., ge=0, description="Entities created")
    entities_updated: int = Field(..., ge=0, description="Entities updated")
    relationships_created: int = Field(..., ge=0, description="Relationships created")
    relationships_updated: int = Field(..., ge=0, description="Relationships updated")
    # SHACL shape-validation violations encountered during the upsert.
    shacl_violations: int = Field(
        ..., ge=0, description="Number of SHACL validation violations"
    )
    processing_time_ms: int = Field(
        ..., ge=0, description="Processing time in milliseconds"
    )
    success: bool = Field(..., description="Whether upsert was successful")
    # Presumably None whenever success is True — confirm with the emitting service.
    error_message: str | None = Field(None, description="Error message if failed")
|
|
|
|
|
|
# RAG events
|
|
class RAGIndexedEventData(BaseEventData):
    """Event emitted when RAG indexing is complete."""

    doc_id: str = Field(..., description="Source document identifier")
    collection_name: str = Field(..., description="Qdrant collection name")
    chunks_indexed: int = Field(..., ge=0, description="Number of chunks indexed")
    embedding_model: str = Field(..., description="Embedding model used")
    # Detection and redaction are reported separately, so consumers can spot
    # payloads where PII was detected but not redacted.
    pii_detected: bool = Field(..., description="Whether PII was detected")
    pii_redacted: bool = Field(..., description="Whether PII was redacted")
    processing_time_ms: int = Field(
        ..., ge=0, description="Processing time in milliseconds"
    )
    storage_path: str = Field(..., description="Path to chunked data")
|
|
|
|
|
|
# Calculation events
|
|
class CalculationReadyEventData(BaseEventData):
    """Event emitted when tax calculation is complete."""

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    schedule_id: str = Field(..., description="Tax schedule identifier (SA102, SA103)")
    calculation_id: str = Field(..., description="Unique calculation run identifier")
    boxes_computed: int = Field(..., ge=0, description="Number of form boxes computed")
    # NOTE(review): monetary totals carried as float — acceptable for event
    # metadata, but confirm the authoritative amounts live elsewhere as Decimal.
    total_income: float | None = Field(None, description="Total income calculated")
    total_tax: float | None = Field(None, description="Total tax calculated")
    confidence: float = Field(
        ..., ge=0.0, le=1.0, description="Calculation confidence score"
    )
    evidence_count: int = Field(
        ..., ge=0, description="Number of evidence items supporting calculation"
    )
    processing_time_ms: int = Field(
        ..., ge=0, description="Processing time in milliseconds"
    )
    storage_path: str = Field(..., description="Path to calculation results")
|
|
|
|
|
|
# Form events
|
|
class FormFilledEventData(BaseEventData):
    """Event emitted when PDF form filling is complete."""

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    form_id: str = Field(..., description="Form identifier (SA100, SA102, etc.)")
    fields_filled: int = Field(..., ge=0, description="Number of fields filled")
    pdf_size_bytes: int = Field(..., ge=0, description="Generated PDF size in bytes")
    storage_path: str = Field(..., description="Path to filled PDF")
    evidence_bundle_path: str | None = Field(
        None, description="Path to evidence bundle ZIP"
    )
    checksum_sha256: str = Field(..., description="PDF checksum for integrity")

    @field_validator("checksum_sha256")
    @classmethod
    def validate_checksum(cls, v: str) -> str:
        """Validate SHA-256 checksum format.

        Mirrors DocumentIngestedEventData.validate_checksum so every checksum
        field in the event schemas is validated and normalized to lowercase
        the same way (this one was previously accepted unchecked).
        """
        if len(v) != 64 or not all(c in "0123456789abcdef" for c in v.lower()):
            raise ValueError("Invalid SHA-256 checksum format")
        return v.lower()
|
|
|
|
|
|
# HMRC events
|
|
class HMRCSubmittedEventData(BaseEventData):
    """Event emitted when HMRC submission is complete."""

    taxpayer_id: str = Field(..., description="Taxpayer identifier")
    tax_year: str = Field(..., description="Tax year (e.g., '2024-25')")
    submission_id: str = Field(..., description="Unique submission identifier")
    # Reference assigned by HMRC; presumably None for dry runs or transport
    # failures — confirm with the emitting service.
    hmrc_reference: str | None = Field(None, description="HMRC submission reference")
    submission_type: Literal["dry_run", "sandbox", "live"] = Field(
        ..., description="Submission environment type"
    )
    success: bool = Field(..., description="Whether submission was successful")
    # HTTP status code from the HMRC API call, when one was made.
    status_code: int | None = Field(None, description="HTTP status code")
    error_message: str | None = Field(None, description="Error message if failed")
    processing_time_ms: int = Field(
        ..., ge=0, description="Processing time in milliseconds"
    )
|
|
|
|
|
|
# Review events
|
|
class ReviewRequestedEventData(BaseEventData):
    """Event emitted when human review is requested."""

    doc_id: str = Field(..., description="Document identifier")
    review_type: Literal["extraction", "calculation", "submission"] = Field(
        ..., description="Type of review needed"
    )
    priority: Literal["low", "medium", "high", "urgent"] = Field(
        ..., description="Review priority level"
    )
    reason: str = Field(..., description="Reason for review request")
    # Assignment and deadline are optional at request time.
    assigned_to: str | None = Field(None, description="User assigned to review")
    due_date: str | None = Field(None, description="Review due date (ISO 8601)")
    metadata: dict[str, Any] = Field(
        default_factory=dict, description="Additional review metadata"
    )
|
|
|
|
|
|
class ReviewCompletedEventData(BaseEventData):
    """Payload published once a human reviewer has finished a review session."""

    doc_id: str = Field(description="Document identifier")
    review_id: str = Field(description="Review session identifier")
    reviewer: str = Field(description="User who completed review")
    decision: Literal["approved", "rejected", "needs_revision"] = Field(
        description="Review decision"
    )
    changes_made: int = Field(ge=0, description="Number of changes made")
    comments: str | None = Field(default=None, description="Reviewer comments")
    review_duration_seconds: int = Field(
        ge=0, description="Time spent in review (seconds)"
    )
|
|
|
|
|
|
# Firm sync events
|
|
class FirmSyncCompletedEventData(BaseEventData):
    """Event emitted when firm database sync is complete."""

    firm_id: str = Field(..., description="Firm identifier")
    connector_type: str = Field(
        ..., description="Connector type (iris, sage, xero, etc.)"
    )
    sync_id: str = Field(..., description="Unique sync run identifier")
    # Outcome counters reported by the connector; whether records_synced equals
    # records_created + records_updated is not established here — confirm
    # against the connector contract.
    records_synced: int = Field(..., ge=0, description="Number of records synced")
    records_created: int = Field(..., ge=0, description="Records created")
    records_updated: int = Field(..., ge=0, description="Records updated")
    records_failed: int = Field(..., ge=0, description="Records that failed to sync")
    success: bool = Field(..., description="Whether sync was successful")
    error_message: str | None = Field(None, description="Error message if failed")
    processing_time_ms: int = Field(
        ..., ge=0, description="Processing time in milliseconds"
    )
|
|
|
|
|
|
# Schema mapping for topic -> data class
|
|
# Single source of truth mapping event topic -> payload schema class;
# validate_event_data and get_schema_for_topic both consult it.
EVENT_SCHEMA_MAP: dict[str, type[BaseEventData]] = {
    # Document lifecycle
    "doc.ingested": DocumentIngestedEventData,
    "doc.ocr_ready": DocumentOCRReadyEventData,
    "doc.extracted": DocumentExtractedEventData,
    # Knowledge graph
    "kg.upsert.ready": KGUpsertReadyEventData,
    "kg.upserted": KGUpsertedEventData,
    # RAG indexing
    "rag.indexed": RAGIndexedEventData,
    # Calculations and forms
    "calc.schedule_ready": CalculationReadyEventData,
    "form.filled": FormFilledEventData,
    # HMRC submission
    "hmrc.submitted": HMRCSubmittedEventData,
    # Human review
    "review.requested": ReviewRequestedEventData,
    "review.completed": ReviewCompletedEventData,
    # Firm connectors
    "firm.sync.completed": FirmSyncCompletedEventData,
}
|
|
|
|
|
|
def validate_event_data(topic: str, data: dict[str, Any]) -> BaseEventData:
    """
    Validate event data against the schema for the given topic.

    Args:
        topic: Event topic name
        data: Raw event data dictionary

    Returns:
        Validated event data model

    Raises:
        ValueError: If topic is unknown or validation fails (pydantic's
            ValidationError subclasses ValueError, so one ``except ValueError``
            catches both failure modes)
    """
    # Delegate the lookup so the "unknown topic" check lives in exactly one
    # place (get_schema_for_topic) instead of being duplicated here.
    schema_class = get_schema_for_topic(topic)
    return schema_class.model_validate(data)
|
|
|
|
|
|
def get_schema_for_topic(topic: str) -> type[BaseEventData]:
    """
    Get the Pydantic schema class for a given topic.

    Args:
        topic: Event topic name

    Returns:
        Schema class for the topic

    Raises:
        ValueError: If topic is unknown
    """
    # EAFP: one dict access instead of a membership test plus a second lookup.
    try:
        return EVENT_SCHEMA_MAP[topic]
    except KeyError:
        raise ValueError(f"Unknown event topic: {topic}") from None
|