# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI

import os

# Import shared libraries
import sys
import time
from datetime import datetime
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()

# Strategy names that _process_page knows how to dispatch.
SUPPORTED_STRATEGIES = ("tesseract", "layoutlm", "hybrid")


class OCRSettings(BaseAppSettings):
    """Settings for OCR service"""

    service_name: str = "svc-ocr"

    # OCR configuration
    tesseract_cmd: str = "/usr/bin/tesseract"
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # Layout analysis
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    confidence_threshold: float = 0.7

    # Processing limits
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    include_coordinates: bool = True
    include_confidence: bool = True


# Create app and settings
app, settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Global clients (populated by startup_event)
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics()


@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies.

    Sets up observability, builds the storage clients, starts the event bus
    and subscribes to document ingestion events.

    Raises:
        HTTPException: If ``create_event_bus`` returns a falsy value.
    """
    global storage_client, document_storage, event_bus

    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        # NOTE(review): raised outside any request; kept for compatibility,
        # the practical effect is that startup aborts.
        raise HTTPException(status_code=500, detail="Event bus not initialized")

    await event_bus.start()

    # Subscribe to document ingestion events
    await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)

    logger.info("OCR service started successfully")


@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies by stopping the event bus, if one exists."""
    global event_bus

    logger.info("Shutting down OCR service")

    if event_bus:
        await event_bus.stop()

    logger.info("OCR service shutdown complete")


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint.

    Returns:
        A dict with a fixed ``"healthy"`` status, the service name and
        version from settings, and the current UTC time in ISO format.
    """
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": datetime.utcnow().isoformat(),
    }


@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Process document with OCR.

    Loads the document from storage and schedules ``_process_document_async``
    as a background task; the response is returned before OCR runs.

    Args:
        doc_id: Identifier of the document to process.
        background_tasks: FastAPI background task queue.
        strategy: One of ``SUPPORTED_STRATEGIES``.
        current_user: Authenticated user claims; ``"sub"`` is used as actor,
            falling back to ``"system"``.
        tenant_id: Tenant the document belongs to.

    Returns:
        A dict with the generated ``processing_id``, the ``doc_id``, the
        status ``"processing"`` and the chosen ``strategy``.

    Raises:
        HTTPException: 400 for an unknown strategy, 404 when storage returns
            nothing for the document, 500 for any other failure.
    """
    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        try:
            # Reject unknown strategies up front; otherwise the request would
            # be acknowledged and only fail later inside the background task.
            if strategy not in SUPPORTED_STRATEGIES:
                raise HTTPException(
                    status_code=400, detail=f"Unknown strategy: {strategy}"
                )

            # Check if document exists
            doc_content = await document_storage.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Start background processing
            background_tasks.add_task(
                _process_document_async,
                doc_id,
                tenant_id,
                doc_content,
                strategy,
                processing_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )

            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to start processing"
            ) from e


@app.get("/results/{doc_id}")
async def get_ocr_results(
    doc_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Get OCR results for document.

    Args:
        doc_id: Identifier of the document.
        current_user: Authenticated user claims (only used to enforce auth).
        tenant_id: Tenant the document belongs to.

    Returns:
        The stored OCR result as returned by ``document_storage``.

    Raises:
        HTTPException: 404 when no result is stored, 500 on any other failure.
    """
    with tracer.start_as_current_span("get_ocr_results") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Get OCR results from storage
            ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
            if not ocr_results:
                raise HTTPException(status_code=404, detail="OCR results not found")

            return ocr_results

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to get OCR results"
            ) from e


async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """Handle document ingestion events.

    PDF documents (``content_type == "application/pdf"``) are processed
    immediately with the ``"hybrid"`` strategy. Events missing ``doc_id`` or
    ``tenant_id`` are logged and ignored. Exceptions are logged, not raised.

    Args:
        topic: Topic the event arrived on (unused).
        payload: Event payload; ``payload.data`` is read as a dict.
    """
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")

        if not doc_id or not tenant_id:
            logger.warning("Invalid document ingestion event", data=data)
            return

        # Auto-process PDF documents
        if data.get("content_type") == "application/pdf":
            logger.info("Auto-processing ingested document", doc_id=doc_id)

            # Get document content
            doc_content = await document_storage.get_document(tenant_id, doc_id)
            if doc_content:
                await _process_document_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    content=doc_content,
                    strategy="hybrid",
                    processing_id=str(ulid.new()),
                    actor=payload.actor,
                )

    except Exception as e:
        logger.error("Failed to handle document ingestion", error=str(e))


async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously.

    Renders the PDF to page images, runs OCR on each page, stores the combined
    result, records metrics and publishes a ``DOC_OCR_READY`` event. Failures
    are logged and counted in ``processing_errors_total``; nothing is raised.

    Args:
        doc_id: Identifier of the document.
        tenant_id: Tenant the document belongs to.
        content: Raw PDF bytes.
        strategy: OCR strategy passed to ``_process_page``.
        processing_id: Identifier of this processing run.
        actor: Actor recorded on the completion event.
    """
    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        # Monotonic start mark so the duration metric covers the real work
        # (render + OCR + store) rather than only the time after it finished.
        started_at = time.perf_counter()

        try:
            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page (page numbers are 1-based)
            pages_data: list[Any] = [
                await _process_page(image, page_num, strategy)
                for page_num, image in enumerate(images, 1)
            ]

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()

            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe(time.perf_counter() - started_at)

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()


async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF to images.

    Uses PyMuPDF when importable, otherwise delegates to
    ``_pdf_to_images_fallback``. At most ``settings.max_pages`` pages are
    rendered.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        One PNG byte string per rendered page, in page order.

    Raises:
        Exception: Any conversion error is logged and re-raised.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not available, using fallback")
        return await _pdf_to_images_fallback(pdf_content)

    try:
        # Open PDF
        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
        try:
            zoom = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR
            images: list[bytes] = []

            for page_num in range(min(len(pdf_doc), settings.max_pages)):
                # Render page to image
                pix = pdf_doc[page_num].get_pixmap(matrix=zoom)
                images.append(pix.tobytes("png"))

            return images
        finally:
            # Release the document even when rendering a page fails.
            pdf_doc.close()

    except Exception as e:
        logger.error("PDF conversion failed", error=str(e))
        raise


async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
    """Fallback PDF to images conversion using pdf2image.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        One PNG byte string per converted page (pages 1..``settings.max_pages``).

    Raises:
        Exception: If pdf2image cannot be imported.
    """
    import io

    try:
        from pdf2image import convert_from_bytes
    except ImportError as e:
        logger.error("pdf2image not available")
        raise Exception("No PDF conversion library available") from e

    images = convert_from_bytes(
        pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
    )

    # Convert PIL images to bytes
    image_bytes: list[bytes] = []
    for img in images:
        img_buffer = io.BytesIO()
        img.save(img_buffer, format="PNG")
        image_bytes.append(img_buffer.getvalue())

    return image_bytes


async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Process single page with OCR.

    Args:
        image_data: Encoded page image bytes.
        page_num: Page number recorded in the result.
        strategy: ``"tesseract"``, ``"layoutlm"`` or ``"hybrid"``.

    Returns:
        The result dict of the chosen engine. For ``"hybrid"``, a dict holding
        both engine results, the Tesseract text, and the higher of the two
        confidences (a missing confidence counts as 0).

    Raises:
        ValueError: If ``strategy`` is not one of the three known values.
    """
    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)

    if strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)

    if strategy == "hybrid":
        # Combine both approaches
        tesseract_result = await _process_with_tesseract(image_data, page_num)
        layoutlm_result = await _process_with_layoutlm(image_data, page_num)

        return {
            "page": page_num,
            "strategy": "hybrid",
            "tesseract": tesseract_result,
            "layoutlm": layoutlm_result,
            "text": tesseract_result.get("text", ""),
            "confidence": max(
                tesseract_result.get("confidence", 0),
                layoutlm_result.get("confidence", 0),
            ),
        }

    raise ValueError(f"Unknown strategy: {strategy}")


async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with Tesseract OCR.

    Args:
        image_data: Encoded page image bytes.
        page_num: Page number recorded in the result.

    Returns:
        On success, a dict with ``page``, ``strategy``, ``text``, ``words``
        (text, confidence scaled to 0..1, and ``[left, top, right, bottom]``
        bbox), the mean word ``confidence`` (0.0 when no words) and
        ``word_count``. On failure, a dict with ``page``, ``strategy`` and
        ``error``; no exception is raised.
    """
    try:
        import io

        import pytesseract
        from PIL import Image

        # Load image
        image = Image.open(io.BytesIO(image_data))

        # Configure Tesseract
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Extract text with confidence
        data = pytesseract.image_to_data(
            image, config=config, output_type=pytesseract.Output.DICT
        )

        # Process results
        words: list[dict[str, Any]] = []
        confidences: list[float] = []

        for text, raw_conf, left, top, width, height in zip(
            data["text"],
            data["conf"],
            data["left"],
            data["top"],
            data["width"],
            data["height"],
        ):
            # float() rather than int(): confidences may arrive as floats or
            # float-formatted strings (e.g. "96.5"), which int() rejects.
            conf = float(raw_conf)
            if conf <= 0:  # Not a valid confidence
                continue

            word_data = {
                "text": text,
                "confidence": conf / 100.0,
                "bbox": [left, top, left + width, top + height],
            }
            words.append(word_data)
            confidences.append(word_data["confidence"])

        # Extract full text
        full_text = pytesseract.image_to_string(image, config=config)

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
            "word_count": len(words),
        }

    except ImportError:
        logger.error("pytesseract not available")
        return {
            "page": page_num,
            "strategy": "tesseract",
            "error": "pytesseract not available",
        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}


async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LayoutLM.

    Currently a placeholder: it logs a warning and returns an empty result
    flagged with ``"error": "Not implemented"``.

    Args:
        image_data: Encoded page image bytes (unused for now).
        page_num: Page number recorded in the result.

    Returns:
        A dict with ``page``, ``strategy`` and ``error``; the placeholder
        also carries empty ``text``/``layout_elements`` and ``confidence`` 0.0.
    """
    try:
        # This would integrate with LayoutLM model
        # For now, return placeholder
        logger.warning("LayoutLM processing not implemented")

        return {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format.

    Args:
        request: The request being handled; its URL becomes ``instance``.
        exc: The raised HTTP exception.

    Returns:
        A JSON response with the exception's status code and an
        ``ErrorResponse`` body (``trace_id`` is left empty).
    """
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)