# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI

import asyncio
import io
import os

# Import shared libraries
import sys
from contextlib import asynccontextmanager
from datetime import datetime
from typing import Any, cast

import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.ocr.processor import OCRProcessor
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class OCRSettings(BaseAppSettings):
    """Settings for OCR service"""

    service_name: str = "svc-ocr"

    # OCR configuration
    tesseract_cmd: str = "/usr/bin/tesseract"
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # Layout analysis
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    confidence_threshold: float = 0.7

    # Processing limits
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    include_coordinates: bool = True
    include_confidence: bool = True

    # Vision/LLM OCR configuration
    vision_provider: str = "ollama"  # or "openai"
    vision_model: str = "llama3.2-vision:11b"
    vision_format: str = (
        "text"  # text | markdown | json | table | key_value | structured
    )
    vision_preprocess: bool = True
    openai_base_url: str = "https://api.openai.com/v1/chat/completions"


# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
vision_processor: OCRProcessor | None = None

# Settings will be initialized after app creation
settings: OCRSettings


async def init_dependencies(app_settings: OCRSettings) -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, event_bus, settings, vision_processor

    # Larger delay to ensure NATS is fully ready before attempting connection
    await asyncio.sleep(10)

    settings = app_settings
    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus with retry logic
    max_retries = 20
    delay = 5
    for attempt in range(1, max_retries + 1):
        logger.info(
            "Attempting NATS connection", url=settings.nats_servers, attempt=attempt
        )
        event_bus = create_event_bus(settings)
        if not event_bus:
            raise HTTPException(status_code=500, detail="Event bus not initialized")
        eb = event_bus
        try:
            # Attempt to start and subscribe
            await eb.start()
            await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
            logger.info("NATS connection established on attempt", attempt=attempt)
            break
        except Exception as e:
            logger.error(
                "Failed to connect to NATS, retrying",
                attempt=attempt,
                error=str(e),
            )
            if attempt == max_retries:
                raise HTTPException(
                    status_code=500, detail="Failed to connect to NATS after retries"
                ) from e
            await asyncio.sleep(delay)
            delay *= 2  # exponential backoff
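
    # Note: vision OCR is best-effort. If the processor below fails to
    # initialize (for example, the configured Ollama vision model is not
    # available), the error is only logged and the service still starts;
    # only the "vision" strategy will return errors until it is fixed.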
    # Initialize shared OCRProcessor for vision strategy
    try:
        vision_processor = OCRProcessor(
            model_name=settings.vision_model,
            provider=settings.vision_provider,
            openai_base_url=settings.openai_base_url,
        )
    except Exception as e:
        logger.error("Failed to initialize vision OCR processor", error=str(e))

    logger.info("OCR service started successfully")


async def shutdown_dependencies() -> None:
    """Shutdown service dependencies"""
    logger.info("Shutting down OCR service")

    eb = event_bus
    if eb is not None:
        await eb.stop()

    logger.info("OCR service shutdown complete")


# Event loop captured at startup so the sync background-task wrapper
# (_schedule_process_document_async below) can hand coroutines back to it
# from worker threads.
_main_loop: asyncio.AbstractEventLoop | None = None


@asynccontextmanager
async def lifespan(app: FastAPI):  # type: ignore
    """FastAPI lifespan event handler"""
    # Startup
    global _main_loop
    _main_loop = asyncio.get_running_loop()
    await init_dependencies(cast(OCRSettings, _settings))
    yield
    # Shutdown
    await shutdown_dependencies()


# Create app and settings with lifespan
app, _settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Override app's lifespan
app.router.lifespan_context = lifespan

tracer = get_tracer("svc-ocr")
metrics = get_metrics()


@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Process document with OCR"""
    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        ds = document_storage
        if ds is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )

        try:
            # Check if document exists
            doc_content = await ds.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Start background processing via sync wrapper (for mypy correctness)
            background_tasks.add_task(
                _schedule_process_document_async,
                doc_id,
                tenant_id,
                doc_content,
                strategy,
                processing_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )

            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to start processing"
            ) from e


@app.get("/results/{doc_id}")
async def get_ocr_results(
    doc_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Get OCR results for document"""
    with tracer.start_as_current_span("get_ocr_results") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        ds = document_storage
        if ds is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )

        try:
            # Get OCR results from storage
            ocr_results = await ds.get_ocr_result(tenant_id, doc_id)
            if not ocr_results:
                raise HTTPException(status_code=404, detail="OCR results not found")

            return ocr_results

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to get OCR results"
            ) from e
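
# Example usage (a sketch, not part of the service): exercising the two
# endpoints above with curl. The port comes from the __main__ block at the
# bottom of this file; the bearer token and the <doc_id> placeholder depend on
# your deployment and on what get_current_user / get_tenant_id expect.
#
#   curl -X POST "http://localhost:8002/process/<doc_id>?strategy=vision" \
#        -H "Authorization: Bearer $TOKEN"
#
#   curl "http://localhost:8002/results/<doc_id>" \
#        -H "Authorization: Bearer $TOKEN"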
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """Handle document ingestion events"""
    data = payload.data
    doc_id = data.get("doc_id")
    tenant_id = data.get("tenant_id")

    if not doc_id or not tenant_id:
        logger.warning("Invalid document ingestion event", data=data)
        return

    ds = document_storage
    if ds is None:
        logger.error("Document storage not initialized")
        return

    # Auto-process PDF documents
    if data.get("mime_type") == "application/pdf":
        logger.info("Auto-processing ingested document", doc_id=doc_id)

        try:
            # Get document content
            doc_content = await ds.get_document(tenant_id, doc_id)
            if doc_content:
                await _process_document_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    content=doc_content,
                    strategy="hybrid",
                    processing_id=str(ulid.new()),
                    actor=payload.actor,
                )
        except Exception as e:
            logger.error(
                "Failed to handle document ingestion", doc_id=doc_id, error=str(e)
            )


async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously"""
    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        try:
            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page
            pages_data: list[dict[str, Any]] = []
            for page_num, image in enumerate(images, 0):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            ds = document_storage
            if ds is None:
                raise RuntimeError("Document storage not initialized")
            await ds.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter(
                "ocr_documents_processed_total", labelnames=["tenant_id", "strategy"]
            ).labels(tenant_id=tenant_id, strategy=strategy).inc()

            metrics.histogram(
                "ocr_processing_duration_seconds", labelnames=["strategy"]
            ).labels(strategy=strategy).observe(
                datetime.utcnow().timestamp()
                - datetime.fromisoformat(
                    ocr_results["processed_at"].replace("Z", "")  # type: ignore
                ).timestamp()
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )
            eb = event_bus
            if eb is not None:
                await eb.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter(
                "ocr_processing_errors_total",
                labelnames=["tenant_id", "strategy", "error_type"],
            ).labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()
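
# A minimal sketch (not a schema) of a DOC_INGESTED event that
# _handle_document_ingested above will auto-process. The handler only looks at
# doc_id, tenant_id and mime_type inside data, plus payload.actor; real events
# from the ingestion service may carry more fields, and the placeholder values
# here are illustrative.
#
#   EventPayload(
#       data={
#           "doc_id": "<doc_id>",
#           "tenant_id": "<tenant_id>",
#           "mime_type": "application/pdf",
#       },
#       actor="<ingesting user or service>",
#       tenant_id="<tenant_id>",
#   )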
""" # First try pdf2image for full-page rasterization try: images = convert_from_bytes( pdf_content, dpi=200, first_page=1, last_page=settings.max_pages ) image_bytes: list[bytes] = [] for img in images: img_buffer = io.BytesIO() img.save(img_buffer, format="PNG") image_bytes.append(img_buffer.getvalue()) return image_bytes except Exception as e: logger.warning( "pdf2image conversion failed; falling back to PyPDF2", error=str(e) ) # Fallback: extract largest embedded image per page using PyPDF2 try: reader = PdfReader(io.BytesIO(pdf_content)) out_images: list[bytes] = [] for page_index, page in enumerate(reader.pages): if page_index >= settings.max_pages: break try: resources = page.get("/Resources") if resources is None: continue xobject = resources.get("/XObject") if xobject is None: continue xobject = xobject.get_object() largest = None largest_area = -1 for _, obj_ref in xobject.items(): try: obj = obj_ref.get_object() if obj.get("/Subtype") != "/Image": continue width = int(obj.get("/Width", 0)) height = int(obj.get("/Height", 0)) area = width * height if area > largest_area: largest = obj largest_area = area except Exception: continue if largest is None: continue data = largest.get_data() filt = largest.get("/Filter") if filt in ("/DCTDecode", "/JPXDecode"): # JPEG or JPEG2000 out_images.append(data) else: # Flate or other; decode via Pillow mode = "RGB" colorspace = largest.get("/ColorSpace") if colorspace in ("/DeviceGray",): mode = "L" width = int(largest.get("/Width", 0)) height = int(largest.get("/Height", 0)) try: img = Image.frombytes(mode, (width, height), data) except Exception: img = Image.open(io.BytesIO(data)) buf = io.BytesIO() img.save(buf, format="PNG") out_images.append(buf.getvalue()) except Exception: continue if not out_images: raise RuntimeError("No images extracted via PyPDF2 fallback") return out_images except Exception as fallback_e: logger.error("PDF conversion failed (both methods)", error=str(fallback_e)) raise async def _process_page( image_data: bytes, page_num: int, strategy: str ) -> dict[str, Any]: """Process single page with OCR""" if strategy == "tesseract": return await _process_with_tesseract(image_data, page_num) elif strategy == "layoutlm": return await _process_with_layoutlm(image_data, page_num) elif strategy == "hybrid": # Combine both approaches tesseract_result = await _process_with_tesseract(image_data, page_num) layoutlm_result = await _process_with_layoutlm(image_data, page_num) return { "page": page_num, "strategy": "hybrid", "tesseract": tesseract_result, "layoutlm": layoutlm_result, "text": tesseract_result.get("text", ""), "confidence": max( tesseract_result.get("confidence", 0), layoutlm_result.get("confidence", 0), ), } elif strategy == "vision": return await _process_with_vision(image_data, page_num) else: raise ValueError(f"Unknown strategy: {strategy}") async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]: """Process page with Tesseract OCR""" try: # Load image image = Image.open(io.BytesIO(image_data)) # Configure Tesseract config = f"{settings.tesseract_config} -l {settings.languages}" # Extract text with confidence data = pytesseract.image_to_data( image, config=config, output_type=pytesseract.Output.DICT ) # Process results words: list[dict[str, Any]] = [] confidences: list[float] = [] for i in range(len(data["text"])): if int(data["conf"][i]) > 0: # Valid confidence word_data = { "text": data["text"][i], "confidence": int(data["conf"][i]) / 100.0, "bbox": [ data["left"][i], data["top"][i], 
data["left"][i] + data["width"][i], data["top"][i] + data["height"][i], ], } words.append(word_data) confidences.append(word_data["confidence"]) # Extract full text full_text = pytesseract.image_to_string(image, config=config) return { "page": page_num, "strategy": "tesseract", "text": full_text.strip(), "words": words, "confidence": sum(confidences) / len(confidences) if confidences else 0.0, "word_count": len(words), } except Exception as e: logger.error("Tesseract processing failed", page=page_num, error=str(e)) return {"page": page_num, "strategy": "tesseract", "error": str(e)} async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]: """Process page with LayoutLM""" try: # This would integrate with LayoutLM model # For now, return placeholder logger.warning("LayoutLM processing not implemented") return { "page": page_num, "strategy": "layoutlm", "text": "", "layout_elements": [], "confidence": 0.0, "error": "Not implemented", } except Exception as e: logger.error("LayoutLM processing failed", page=page_num, error=str(e)) return {"page": page_num, "strategy": "layoutlm", "error": str(e)} async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]: """Process page with LLM vision OCR via shared OCRProcessor""" try: vp = vision_processor if vp is None: raise RuntimeError("Vision OCR processor not initialized") # Persist the page image temporarily for the processor API import tempfile with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: tmp.write(image_data) tmp_path = tmp.name try: text = vp.process_image( image_path=tmp_path, format_type=settings.vision_format, preprocess=settings.vision_preprocess, language=settings.languages, ) finally: try: os.remove(tmp_path) except OSError: pass return { "page": page_num, "strategy": "vision", "text": text if isinstance(text, str) else str(text), "confidence": 0.0, # Not provided by LLM API } except Exception as e: logger.error("Vision processing failed", page=page_num, error=str(e)) return {"page": page_num, "strategy": "vision", "error": str(e)} def _schedule_process_document_async( doc_id: str, tenant_id: str, content: bytes, strategy: str, processing_id: str, actor: str, ) -> None: """Sync wrapper to schedule the async OCR task. This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict. """ asyncio.create_task( _process_document_async( doc_id=doc_id, tenant_id=tenant_id, content=content, strategy=strategy, processing_id=processing_id, actor=actor, ) ) @app.exception_handler(HTTPException) async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse: """Handle HTTP exceptions with RFC7807 format""" return JSONResponse( status_code=exc.status_code, content=ErrorResponse( type=f"https://httpstatuses.com/{exc.status_code}", title=exc.detail, status=exc.status_code, detail=exc.detail, instance=str(request.url), trace_id="", ).model_dump(), ) if __name__ == "__main__": import uvicorn uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)