# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
import os
import sys
from datetime import datetime
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse

# Import shared libraries
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class OCRSettings(BaseAppSettings):
    """Settings for OCR service"""

    service_name: str = "svc-ocr"

    # OCR configuration
    tesseract_cmd: str = "/usr/bin/tesseract"
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # Layout analysis
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    confidence_threshold: float = 0.7

    # Processing limits
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    include_coordinates: bool = True
    include_confidence: bool = True
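

# Note: BaseAppSettings lives in libs.config and is not shown here. Assuming it follows
# the usual pydantic BaseSettings pattern, each field above can presumably be overridden
# via environment variables (e.g. something like CONFIDENCE_THRESHOLD=0.8), but the exact
# variable names and any prefix depend on that shared library.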


# Create app and settings
app, settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None

tracer = get_tracer("svc-ocr")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus
logger.info("Starting OCR service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
# Subscribe to document ingestion events
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
logger.info("OCR service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down OCR service")
if event_bus:
await event_bus.stop()
logger.info("OCR service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/process/{doc_id}")
async def process_document(
doc_id: str,
background_tasks: BackgroundTasks,
strategy: str = "hybrid",
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Process document with OCR"""
with tracer.start_as_current_span("process_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", strategy)
try:
# Check if document exists
doc_content = await document_storage.get_document(tenant_id, doc_id)
if not doc_content:
raise HTTPException(status_code=404, detail="Document not found")
# Generate processing ID
processing_id = str(ulid.new())
span.set_attribute("processing_id", processing_id)
# Start background processing
background_tasks.add_task(
_process_document_async,
doc_id,
tenant_id,
doc_content,
strategy,
processing_id,
current_user.get("sub", "system"),
)
logger.info(
"OCR processing started", doc_id=doc_id, processing_id=processing_id
)
return {
"processing_id": processing_id,
"doc_id": doc_id,
"status": "processing",
"strategy": strategy,
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start processing")
@app.get("/results/{doc_id}")
async def get_ocr_results(
doc_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get OCR results for document"""
with tracer.start_as_current_span("get_ocr_results") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get OCR results from storage
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
return ocr_results
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get OCR results")


async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """Handle document ingestion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")

        if not doc_id or not tenant_id:
            logger.warning("Invalid document ingestion event", data=data)
            return

        # Auto-process PDF documents
        if data.get("content_type") == "application/pdf":
            logger.info("Auto-processing ingested document", doc_id=doc_id)

            # Get document content
            doc_content = await document_storage.get_document(tenant_id, doc_id)
            if doc_content:
                await _process_document_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    content=doc_content,
                    strategy="hybrid",
                    processing_id=str(ulid.new()),
                    actor=payload.actor,
                )

    except Exception as e:
        logger.error("Failed to handle document ingestion", error=str(e))


async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously"""
    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        try:
            started_at = datetime.utcnow()

            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page
            pages_data: list[dict[str, Any]] = []
            for page_num, image in enumerate(images, 1):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics: count processed documents and record the full
            # processing duration measured from the start of PDF conversion
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()
            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe((datetime.utcnow() - started_at).total_seconds())

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )
            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()


async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF pages to PNG images"""
    try:
        import fitz  # PyMuPDF

        # Open PDF from memory
        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
        images: list[bytes] = []

        for page_num in range(min(len(pdf_doc), settings.max_pages)):
            page = pdf_doc[page_num]

            # Render page to image at 2x zoom for better OCR
            mat = fitz.Matrix(2.0, 2.0)
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")
            images.append(img_data)

        pdf_doc.close()
        return images

    except ImportError:
        logger.error("PyMuPDF not available, using fallback")
        return await _pdf_to_images_fallback(pdf_content)
    except Exception as e:
        logger.error("PDF conversion failed", error=str(e))
        raise


async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
    """Fallback PDF-to-image conversion using pdf2image"""
    try:
        import io

        from pdf2image import convert_from_bytes

        images = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )

        # Convert PIL images to PNG bytes
        image_bytes: list[bytes] = []
        for img in images:
            img_buffer = io.BytesIO()
            img.save(img_buffer, format="PNG")
            image_bytes.append(img_buffer.getvalue())

        return image_bytes

    except ImportError:
        logger.error("pdf2image not available")
        raise RuntimeError("No PDF conversion library available")


async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Process single page with OCR"""
    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)
    elif strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)
    elif strategy == "hybrid":
        # Combine both approaches
        tesseract_result = await _process_with_tesseract(image_data, page_num)
        layoutlm_result = await _process_with_layoutlm(image_data, page_num)

        return {
            "page": page_num,
            "strategy": "hybrid",
            "tesseract": tesseract_result,
            "layoutlm": layoutlm_result,
            "text": tesseract_result.get("text", ""),
            "confidence": max(
                tesseract_result.get("confidence", 0),
                layoutlm_result.get("confidence", 0),
            ),
        }
    else:
        raise ValueError(f"Unknown strategy: {strategy}")


async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with Tesseract OCR"""
    try:
        import io

        import pytesseract
        from PIL import Image

        # Load image
        image = Image.open(io.BytesIO(image_data))

        # Configure Tesseract
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Extract word-level text with confidence and bounding boxes
        data = pytesseract.image_to_data(
            image, config=config, output_type=pytesseract.Output.DICT
        )

        # Process results
        words: list[dict[str, Any]] = []
        confidences: list[float] = []

        for i in range(len(data["text"])):
            if int(data["conf"][i]) > 0:  # Valid confidence
                word_data = {
                    "text": data["text"][i],
                    "confidence": int(data["conf"][i]) / 100.0,
                    "bbox": [
                        data["left"][i],
                        data["top"][i],
                        data["left"][i] + data["width"][i],
                        data["top"][i] + data["height"][i],
                    ],
                }
                words.append(word_data)
                confidences.append(word_data["confidence"])

        # Extract full text
        full_text = pytesseract.image_to_string(image, config=config)

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
            "word_count": len(words),
        }

    except ImportError:
        logger.error("pytesseract not available")
        return {
            "page": page_num,
            "strategy": "tesseract",
            "error": "pytesseract not available",
        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}


async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LayoutLM"""
    try:
        # This would integrate with the LayoutLM model configured in
        # settings.layoutlm_model; for now, return a placeholder result.
        logger.warning("LayoutLM processing not implemented")

        return {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)