completed local setup with compose
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

This commit is contained in:
harkon
2025-11-26 13:17:17 +00:00
parent 8fe5e62fee
commit fdba81809f
87 changed files with 5610 additions and 3376 deletions

View File

@@ -7,13 +7,14 @@ import os
# Import shared libraries
import sys
from contextlib import asynccontextmanager
from datetime import datetime
from typing import Any, cast
import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
@@ -78,6 +79,8 @@ settings: OCRSettings
async def init_dependencies(app_settings: OCRSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus, settings, vision_processor
# Larger delay to ensure NATS is fully ready before attempting connection
await asyncio.sleep(10)
settings = app_settings
logger.info("Starting OCR service")
@@ -89,17 +92,35 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
eb = event_bus
# mypy: event_bus is Optional, so use local alias after check
await eb.start()
# Subscribe to document ingestion events
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
# Initialize event bus with retry logic
max_retries = 20
delay = 5
for attempt in range(1, max_retries + 1):
logger.info(
"Attempting NATS connection", url=settings.nats_servers, attempt=attempt
)
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
eb = event_bus
try:
# Attempt to start and subscribe
await eb.start()
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
logger.info("NATS connection established on attempt", attempt=attempt)
break
except Exception as e:
logger.error(
"Failed to connect to NATS, retrying",
attempt=attempt,
error=str(e),
)
if attempt == max_retries:
raise HTTPException(
status_code=500, detail="Failed to connect to NATS after retries"
)
await asyncio.sleep(delay)
delay *= 2 # exponential backoff
# Initialize shared OCRProcessor for vision strategy
try:
@@ -114,7 +135,26 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
logger.info("OCR service started successfully")
# Create app and settings
async def shutdown_dependencies() -> None:
    """Tear down the resources held by the OCR service.

    Stops the module-level event bus when one has been created, and logs
    a message before and after the teardown.
    """
    logger.info("Shutting down OCR service")

    # Bind the global once so the None check and the call use the same object.
    if (bus := event_bus) is not None:
        await bus.stop()

    logger.info("OCR service shutdown complete")
@asynccontextmanager
async def lifespan(app: FastAPI):  # type: ignore
    """FastAPI lifespan handler: set up dependencies, then tear them down.

    Args:
        app: The FastAPI application this lifespan is attached to. It is
            not used directly; dependencies are initialised from the
            module-level ``_settings``.

    Yields:
        None. Control returns to the framework while the app is serving.
    """
    # Startup: runs before the application starts handling requests.
    await init_dependencies(cast(OCRSettings, _settings))
    try:
        yield
    finally:
        # Shutdown: run in ``finally`` so the event bus is still stopped when
        # an exception or cancellation is thrown into the generator at the
        # ``yield`` instead of a clean resume.
        await shutdown_dependencies()
# Create app and settings with lifespan
app, _settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
@@ -122,8 +162,8 @@ app, _settings = create_app(
settings_class=OCRSettings,
) # fmt: skip
# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
# Override app's lifespan
app.router.lifespan_context = lifespan
tracer = get_tracer("svc-ocr")
metrics = get_metrics()

View File

@@ -14,3 +14,12 @@ opencv-python-headless>=4.12.0.88 # Headless version is smaller
# Computer vision (torchvision not in base-ml)
torchvision>=0.23.0
# OpenTelemetry (required by libs/observability)
opentelemetry-api>=1.21.0
opentelemetry-sdk>=1.21.0
opentelemetry-exporter-otlp-proto-grpc>=1.21.0
opentelemetry-instrumentation-fastapi>=0.42b0
opentelemetry-instrumentation-httpx>=0.42b0
opentelemetry-instrumentation-psycopg2>=0.42b0
opentelemetry-instrumentation-redis>=0.42b0