full ingestion -> OCR -> extraction flow is now working correctly.

Author: harkon
Date: 2025-11-26 15:46:59 +00:00
Commit: db61b05c80 (parent fdba81809f)
17 changed files with 170 additions and 553 deletions

View File

@@ -2,7 +2,7 @@
 # FILE: libs/app_factory.py
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Awaitable, Callable
 from contextlib import asynccontextmanager
 from typing import Any
@@ -36,6 +36,8 @@ def create_app(  # pylint: disable=too-many-arguments,too-many-positional-arguments
     version: str = "1.0.0",
     settings_class: type[BaseAppSettings] = BaseAppSettings,
     custom_settings: dict[str, Any] | None = None,
+    startup_hooks: list[Callable[[], Awaitable[None]]] | None = None,
+    shutdown_hooks: list[Callable[[], Awaitable[None]]] | None = None,
 ) -> tuple[FastAPI, BaseAppSettings]:
     """Create a FastAPI application with standard configuration"""
@@ -56,8 +58,14 @@ def create_app(  # pylint: disable=too-many-arguments,too-many-positional-arguments
     ) -> AsyncIterator[None]:  # pylint: disable=unused-argument
         # Startup
         setup_observability(settings)
+        if startup_hooks:
+            for hook in startup_hooks:
+                await hook()
         yield
         # Shutdown
+        if shutdown_hooks:
+            for hook in shutdown_hooks:
+                await hook()

     # Create FastAPI app
     app = FastAPI(
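For context, a hedged sketch of how a service might opt into the new hooks. Only the startup_hooks and shutdown_hooks keywords come from the diff above; the leading title argument and the connection helpers are assumptions for illustration.

from libs.app_factory import create_app


async def open_connections() -> None:
    # hypothetical: create DB pools, verify object storage, warm caches
    ...


async def close_connections() -> None:
    # hypothetical: dispose pools, flush telemetry
    ...


app, settings = create_app(
    title="svc-extract",  # leading arguments assumed, not shown in the diff
    version="1.0.0",
    startup_hooks=[open_connections],
    shutdown_hooks=[close_connections],
)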

View File

@@ -4,15 +4,15 @@
 class EventTopics:  # pylint: disable=too-few-public-methods
     """Standard event topic names"""

-    DOC_INGESTED = "doc.ingested"
-    DOC_OCR_READY = "doc.ocr_ready"
-    DOC_EXTRACTED = "doc.extracted"
-    KG_UPSERT_READY = "kg.upsert.ready"
-    KG_UPSERTED = "kg.upserted"
-    RAG_INDEXED = "rag.indexed"
-    CALC_SCHEDULE_READY = "calc.schedule_ready"
-    FORM_FILLED = "form.filled"
-    HMRC_SUBMITTED = "hmrc.submitted"
-    REVIEW_REQUESTED = "review.requested"
-    REVIEW_COMPLETED = "review.completed"
-    FIRM_SYNC_COMPLETED = "firm.sync.completed"
+    DOC_INGESTED = "doc_ingested"
+    DOC_OCR_READY = "doc_ocr_ready"
+    DOC_EXTRACTED = "doc_extracted"
+    KG_UPSERT_READY = "kg_upsert_ready"
+    KG_UPSERTED = "kg_upserted"
+    RAG_INDEXED = "rag_indexed"
+    CALC_SCHEDULE_READY = "calc_schedule_ready"
+    FORM_FILLED = "form_filled"
+    HMRC_SUBMITTED = "hmrc_submitted"
+    REVIEW_REQUESTED = "review_requested"
+    REVIEW_COMPLETED = "review_completed"
+    FIRM_SYNC_COMPLETED = "firm_sync_completed"
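A minimal sketch of what the rename implies on the consumer side: any subscriber keyed on the old dotted names must move to the underscore names in the same release, or its events are silently dropped. The libs.events import path, the handlers, and the dispatch helper are illustrative, not the repo's actual bus API.

from libs.events import EventTopics  # import path assumed


async def on_doc_ocr_ready(event: dict) -> None: ...
async def on_doc_extracted(event: dict) -> None: ...

HANDLERS = {
    EventTopics.DOC_OCR_READY: on_doc_ocr_ready,    # now "doc_ocr_ready"
    EventTopics.DOC_EXTRACTED: on_doc_extracted,    # now "doc_extracted"
}


async def dispatch(topic: str, event: dict) -> None:
    # A stale producer still publishing "doc.extracted" would fall through here.
    handler = HANDLERS.get(topic)
    if handler is not None:
        await handler(event)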

View File

@@ -11,7 +11,7 @@ psycopg2-binary>=2.9.11
 neo4j>=6.0.2
 redis[hiredis]>=6.4.0
-minio>=7.2.18
+minio==7.2.18
 boto3>=1.34.0
 qdrant-client>=1.15.1

View File

@@ -72,22 +72,23 @@ class DocumentExtractedEventData(BaseEventData):
     """Event emitted when field extraction is complete."""

     doc_id: str = Field(..., description="Document identifier")
     tenant_id: str = Field(..., description="Tenant identifier")
     extraction_id: str = Field(..., description="Unique extraction run identifier")
     strategy: Literal["llm", "rules", "hybrid"] = Field(
         ..., description="Extraction strategy used"
     )
-    fields_extracted: int = Field(..., ge=0, description="Number of fields extracted")
-    confidence_avg: float = Field(
-        ..., ge=0.0, le=1.0, description="Average extraction confidence"
+    field_count: int = Field(..., ge=0, description="Number of fields extracted")
+    confidence: float = Field(
+        ..., ge=0.0, le=1.0, description="Extraction confidence score"
     )
-    calibrated_confidence: float = Field(
-        ..., ge=0.0, le=1.0, description="Calibrated confidence score"
+    extraction_results: dict[str, Any] = Field(
+        ..., description="Full extraction results including provenance"
     )
     model_name: str | None = Field(None, description="LLM model used (if applicable)")
-    processing_time_ms: int = Field(
-        ..., ge=0, description="Processing time in milliseconds"
+    processing_time_ms: int | None = Field(
+        None, ge=0, description="Processing time in milliseconds"
     )
-    storage_path: str = Field(..., description="Path to extraction results")
+    storage_path: str | None = Field(None, description="Path to extraction results")

 # Knowledge Graph events
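A hedged sketch of emitting the reshaped event. Field names follow the new side of the diff; the import path, the example values, and any required fields inherited from BaseEventData are assumptions.

from libs.events.schemas import DocumentExtractedEventData  # import path assumed

event = DocumentExtractedEventData(
    doc_id="doc-123",
    tenant_id="default",
    extraction_id="ext-20251126-0001",
    strategy="llm",
    field_count=12,
    confidence=0.87,
    extraction_results={
        "employer_name": {"value": "Acme Ltd", "confidence": 0.91},
    },
    model_name="gpt-4o",        # still optional
    processing_time_ms=2150,    # now optional
    storage_path=None,          # now optional
)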

View File

@@ -41,6 +41,11 @@ def get_current_tenant(request: Request) -> str | None:
         if role.startswith("tenant:"):
             return str(role.split(":", 1)[1])

+    # Check for explicit tenant header (useful for testing/API keys)
+    tenant_header = request.headers.get("X-Tenant-ID")
+    if tenant_header:
+        return tenant_header
+
     # Default tenant for development
     return "default"

View File

@@ -19,17 +19,13 @@ class StorageClient:
     async def ensure_bucket(self, bucket_name: str, region: str = "us-east-1") -> bool:
         """Ensure bucket exists, create if not"""
         try:
-            # Check if bucket exists
-            if self.client.bucket_exists(bucket_name):
-                logger.debug("Bucket already exists", bucket=bucket_name)
-                return True
-
-            # Create bucket
-            self.client.make_bucket(bucket_name, location=region)
+            self.client.make_bucket(bucket_name=bucket_name, location=region)
             logger.info("Created bucket", bucket=bucket_name, region=region)
             return True
         except S3Error as e:
+            if e.code in ("BucketAlreadyOwnedByYou", "BucketAlreadyExists"):
+                logger.debug("Bucket already exists", bucket=bucket_name)
+                return True
             logger.error("Failed to ensure bucket", bucket=bucket_name, error=str(e))
             return False
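A hedged sketch of the startup pattern this change enables: because the client now attempts make_bucket first and treats BucketAlreadyOwnedByYou / BucketAlreadyExists as success, every replica can call ensure_bucket unconditionally at boot without a check-then-create race. Bucket names are illustrative, and wiring it as a startup hook assumes the create_app change above.

async def prepare_buckets(storage: "StorageClient") -> None:
    # Idempotent: safe to run on every boot and from every replica.
    for bucket in ("raw-docs", "ocr-output", "extraction-results"):
        if not await storage.ensure_bucket(bucket):
            raise RuntimeError(f"could not ensure bucket: {bucket}")

# e.g. create_app(..., startup_hooks=[lambda: prepare_buckets(storage)])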