Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
505 lines
16 KiB
Python
505 lines
16 KiB
Python
# FILE: apps/svc-ocr/main.py
|
|
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
|
|
|
|
import os
|
|
|
|
# Import shared libraries
|
|
import sys
|
|
from datetime import datetime
|
|
from typing import Any
|
|
|
|
import structlog
|
|
import ulid
|
|
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
|
from fastapi.responses import JSONResponse
|
|
|
|
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
|
|
|
from libs.app_factory import create_app
|
|
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
|
|
from libs.events import EventBus, EventPayload, EventTopics
|
|
from libs.observability import get_metrics, get_tracer, setup_observability
|
|
from libs.schemas import ErrorResponse
|
|
from libs.security import get_current_user, get_tenant_id
|
|
from libs.storage import DocumentStorage, StorageClient
|
|
|
|
# Module-level structlog logger shared by every handler and helper in this file.
logger = structlog.get_logger()
|
|
|
|
|
|
class OCRSettings(BaseAppSettings):
    """Configuration for the OCR service, layered on top of ``BaseAppSettings``."""

    service_name: str = "svc-ocr"

    # --- OCR engine ---
    # NOTE(review): tesseract_cmd is declared here but not read anywhere in
    # this module - confirm where it is applied.
    tesseract_cmd: str = "/usr/bin/tesseract"
    # Base flags; the Tesseract helper appends "-l <languages>" to this string.
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # --- Layout analysis ---
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    # Echoed into the metadata of every stored OCR result.
    confidence_threshold: float = 0.7

    # --- Processing limits ---
    # Upper bound on the number of PDF pages rendered per document.
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # --- Output configuration ---
    include_coordinates: bool = True
    include_confidence: bool = True
|
|
|
|
|
|
# Build the application object and its settings instance in one call;
# OCRSettings supplies the service-specific configuration fields.
app, settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip
|
|
|
|
# Process-wide clients. The three ``None`` placeholders are assigned by
# startup_event(); the tracer and metrics handles are created at import time.
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
|
|
|
|
|
|
@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies.

    Sets up observability, builds the MinIO-backed storage clients, starts the
    event bus and subscribes to document-ingestion events.

    Raises:
        RuntimeError: If ``create_event_bus`` returns a falsy value.
    """
    global storage_client, document_storage, event_bus

    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client and the storage wrappers built on it
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        # No HTTP request exists during startup, so an HTTPException would be
        # misleading here; fail startup with a plain runtime error instead.
        raise RuntimeError("Event bus not initialized")

    await event_bus.start()

    # Subscribe to document ingestion events
    await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)

    logger.info("OCR service started successfully")
|
|
|
|
|
|
@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Release service dependencies when the application stops."""
    global event_bus

    logger.info("Shutting down OCR service")

    # The bus is only set once startup has run, so it may still be None here.
    bus = event_bus
    if bus:
        await bus.stop()

    logger.info("OCR service shutdown complete")
|
|
|
|
|
|
@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Report a static "healthy" status plus service name, version and a timestamp."""
    report: dict[str, Any] = {"status": "healthy"}
    report["service"] = settings.service_name
    report["version"] = settings.service_version
    # Naive UTC timestamp in ISO-8601 form (no offset suffix).
    report["timestamp"] = datetime.utcnow().isoformat()
    return report
|
|
|
|
|
|
@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Queue OCR processing for a stored document.

    Args:
        doc_id: Identifier of the document to process.
        background_tasks: FastAPI task queue that runs the OCR job after the
            response has been sent.
        strategy: One of ``"tesseract"``, ``"layoutlm"`` or ``"hybrid"``.
        current_user: Authenticated user claims; ``sub`` is recorded as actor.
        tenant_id: Tenant that owns the document.

    Returns:
        A dict with ``processing_id``, ``doc_id``, ``status`` and ``strategy``.

    Raises:
        HTTPException: 400 for an unknown strategy, 404 when the document has
            no content in storage, 500 for any other failure.
    """
    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        # Reject unknown strategies up front. Otherwise the request would be
        # accepted and only fail later, invisibly, inside the background task.
        if strategy not in ("tesseract", "layoutlm", "hybrid"):
            raise HTTPException(
                status_code=400, detail=f"Unknown strategy: {strategy}"
            )

        try:
            # Check if document exists
            doc_content = await document_storage.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Start background processing
            background_tasks.add_task(
                _process_document_async,
                doc_id,
                tenant_id,
                doc_content,
                strategy,
                processing_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )

            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }

        except HTTPException:
            # Deliberate HTTP errors (e.g. the 404 above) pass through untouched.
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            # Chain the cause so the original traceback is preserved.
            raise HTTPException(
                status_code=500, detail="Failed to start processing"
            ) from e
|
|
|
|
|
|
@app.get("/results/{doc_id}")
async def get_ocr_results(
    doc_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Return the stored OCR output for a document.

    Responds with 404 when storage has no result for the tenant/document pair
    and with 500 when the lookup itself fails.
    """
    with tracer.start_as_current_span("get_ocr_results") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Look the result up in document storage
            stored = await document_storage.get_ocr_result(tenant_id, doc_id)
            if stored:
                return stored
            raise HTTPException(status_code=404, detail="OCR results not found")
        except HTTPException:
            # Let the 404 above propagate unchanged.
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to get OCR results")
|
|
|
|
|
|
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """React to a document-ingestion event by OCR-ing PDF documents.

    Events missing ``doc_id`` or ``tenant_id`` are logged and dropped, and
    non-PDF content types are ignored. Any exception is logged rather than
    propagated to the caller.
    """
    try:
        event_data = payload.data
        doc_id = event_data.get("doc_id")
        tenant_id = event_data.get("tenant_id")

        if not (doc_id and tenant_id):
            logger.warning("Invalid document ingestion event", data=event_data)
            return

        # Only PDF documents are auto-processed
        if event_data.get("content_type") != "application/pdf":
            return

        logger.info("Auto-processing ingested document", doc_id=doc_id)

        # Fetch the document body; nothing to do when storage returns none
        doc_content = await document_storage.get_document(tenant_id, doc_id)
        if not doc_content:
            return

        await _process_document_async(
            doc_id=doc_id,
            tenant_id=tenant_id,
            content=doc_content,
            strategy="hybrid",
            processing_id=str(ulid.new()),
            actor=payload.actor,
        )

    except Exception as e:
        logger.error("Failed to handle document ingestion", error=str(e))
|
|
|
|
|
|
async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Run the OCR pipeline for one document and publish the outcome.

    Converts the PDF to page images, processes every page with ``strategy``,
    stores the combined result, records metrics and publishes a
    ``DOC_OCR_READY`` event. Failures are logged and counted in
    ``processing_errors_total``; they are not re-raised.

    Args:
        doc_id: Identifier of the document being processed.
        tenant_id: Tenant that owns the document.
        content: Raw PDF bytes.
        strategy: Page-processing strategy passed to ``_process_page``.
        processing_id: Identifier of this processing run.
        actor: Actor recorded on the published event.
    """
    import time  # local import: only the duration measurement needs it

    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        # Monotonic start mark. The previous code subtracted "processed_at",
        # which was captured after processing, so the metric was always ~0.
        started = time.perf_counter()

        try:
            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page (page numbers are 1-based)
            pages_data: list[Any] = [
                await _process_page(image, page_num, strategy)
                for page_num, image in enumerate(images, 1)
            ]

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()

            # Duration covers conversion, page processing and result storage.
            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe(time.perf_counter() - started)

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()
|
|
|
|
|
|
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Render the pages of a PDF to PNG images.

    Uses PyMuPDF when it can be imported and otherwise delegates to
    ``_pdf_to_images_fallback``. At most ``settings.max_pages`` pages are
    rendered.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        One PNG byte string per rendered page, in page order.

    Raises:
        Exception: Any rendering error is logged and re-raised unchanged.
    """
    # Keep the ImportError guard around the import only, so an unrelated
    # ImportError raised during rendering is not mistaken for a missing library.
    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not available, using fallback")
        return await _pdf_to_images_fallback(pdf_content)

    try:
        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
        try:
            zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR; built once
            page_count = min(len(pdf_doc), settings.max_pages)
            return [
                pdf_doc[page_num].get_pixmap(matrix=zoom).tobytes("png")
                for page_num in range(page_count)
            ]
        finally:
            # Close the document even when a page fails to render.
            pdf_doc.close()
    except Exception as e:
        logger.error("PDF conversion failed", error=str(e))
        raise
|
|
|
|
|
|
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
    """Render a PDF to PNG images with ``pdf2image``.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        One PNG byte string per page, limited to ``settings.max_pages`` pages.

    Raises:
        RuntimeError: If ``pdf2image`` cannot be imported. This is a subclass
            of ``Exception``, so existing ``except Exception`` callers still
            catch it.
    """
    import io

    # Guard only the import, so an ImportError raised during conversion is not
    # misreported as "no library available".
    try:
        from pdf2image import convert_from_bytes
    except ImportError as err:
        logger.error("pdf2image not available")
        raise RuntimeError("No PDF conversion library available") from err

    pil_pages = convert_from_bytes(
        pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
    )

    # Convert PIL images to bytes
    image_bytes: list[bytes] = []
    for pil_page in pil_pages:
        with io.BytesIO() as img_buffer:
            pil_page.save(img_buffer, format="PNG")
            image_bytes.append(img_buffer.getvalue())

    return image_bytes
|
|
|
|
|
|
async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Run OCR on one page image using the requested strategy.

    ``"tesseract"`` and ``"layoutlm"`` return that engine's result directly.
    ``"hybrid"`` runs both and returns a combined dict whose text comes from
    Tesseract and whose confidence is the higher of the two. Any other
    strategy raises ``ValueError``.
    """
    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)

    if strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)

    if strategy != "hybrid":
        raise ValueError(f"Unknown strategy: {strategy}")

    # Hybrid: run both engines, Tesseract first, and merge their results.
    ocr_part = await _process_with_tesseract(image_data, page_num)
    layout_part = await _process_with_layoutlm(image_data, page_num)

    page_text = ocr_part.get("text", "")
    best_confidence = max(
        ocr_part.get("confidence", 0),
        layout_part.get("confidence", 0),
    )

    return {
        "page": page_num,
        "strategy": "hybrid",
        "tesseract": ocr_part,
        "layoutlm": layout_part,
        "text": page_text,
        "confidence": best_confidence,
    }
|
|
|
|
|
|
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process one page image with Tesseract OCR.

    Args:
        image_data: Encoded image bytes readable by PIL.
        page_num: Page number echoed back in the result.

    Returns:
        On success, a dict with ``page``, ``strategy``, ``text``, ``words``
        (each with ``text``, ``confidence`` in 0-1 and ``bbox`` as
        ``[left, top, right, bottom]``), the mean ``confidence`` and
        ``word_count``. On failure, a dict with ``page``, ``strategy`` and
        ``error``; no exception is raised to the caller.
    """
    try:
        import io

        import pytesseract
        from PIL import Image

        # Configure Tesseract
        # NOTE(review): settings.tesseract_cmd is not applied here - confirm
        # whether the binary path should be set on pytesseract.
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # The context manager releases the image once OCR is finished.
        with Image.open(io.BytesIO(image_data)) as image:
            # Word-level data with confidences, then the full page text
            data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )
            full_text = pytesseract.image_to_string(image, config=config)

        # Process results
        words: list[Any] = []
        columns = zip(
            data["text"],
            data["conf"],
            data["left"],
            data["top"],
            data["width"],
            data["height"],
        )
        for text, conf, left, top, width, height in columns:
            # Parse via float(): confidences can presumably arrive as ints,
            # floats or numeric strings such as "96.5" depending on the
            # installed versions, and int("96.5") would raise and discard the
            # whole page - TODO confirm against the deployed pytesseract.
            score = float(conf)
            if score <= 0:  # Keep only entries with a positive confidence
                continue
            words.append(
                {
                    "text": text,
                    "confidence": score / 100.0,
                    "bbox": [left, top, left + width, top + height],
                }
            )

        confidences = [word["confidence"] for word in words]

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
            "word_count": len(words),
        }

    except ImportError:
        logger.error("pytesseract not available")
        return {
            "page": page_num,
            "strategy": "tesseract",
            "error": "pytesseract not available",
        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}
|
|
|
|
|
|
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Placeholder for LayoutLM page processing.

    No model is invoked yet: the function logs a warning and returns an empty
    result marked with ``"error": "Not implemented"``.
    """
    try:
        # LayoutLM integration is still to be written; emit a stub result.
        logger.warning("LayoutLM processing not implemented")

        placeholder: dict[str, Any] = {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }
        return placeholder

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
|
|
|
|
|
|
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Render an ``HTTPException`` as an RFC7807-style JSON body.

    Args:
        request: The request being handled; its URL becomes ``instance``.
        exc: The raised exception; its status code and detail fill the body.

    Returns:
        A ``JSONResponse`` carrying the exception's status code, the problem
        document and any headers attached to the exception.
    """
    problem = ErrorResponse(
        type=f"https://httpstatuses.com/{exc.status_code}",
        title=exc.detail,
        status=exc.status_code,
        detail=exc.detail,
        instance=str(request.url),
        trace_id="",
    )
    return JSONResponse(
        status_code=exc.status_code,
        content=problem.model_dump(),
        # Forward headers set on the exception (e.g. WWW-Authenticate on a
        # 401); previously this handler silently dropped them.
        headers=getattr(exc, "headers", None),
    )
|
|
|
|
|
|
if __name__ == "__main__":
    import uvicorn

    # Development entry point: serve this module's app on port 8002 with
    # auto-reload, with uvicorn's own log configuration disabled.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8002,
        reload=True,
        log_config=None,
    )
|