# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
import os
import sys
from datetime import datetime
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse

# Import shared libraries
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class OCRSettings(BaseAppSettings):
    """Settings for OCR service"""

    service_name: str = "svc-ocr"

    # OCR configuration
    tesseract_cmd: str = "/usr/bin/tesseract"
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # Layout analysis
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    confidence_threshold: float = 0.7

    # Processing limits
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    include_coordinates: bool = True
    include_confidence: bool = True
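

# Note: BaseAppSettings lives in libs.config and is not shown here. Assuming it follows
# the usual pydantic BaseSettings pattern, each field above can presumably be overridden
# via environment variables (e.g. something like CONFIDENCE_THRESHOLD=0.8), but the exact
# variable names and any prefix depend on that shared library.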


# Create app and settings
app, settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None

tracer = get_tracer("svc-ocr")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus
logger.info("Starting OCR service")
# Setup observability
setup_observability(settings)
# Initialize MinIO client
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
# Subscribe to document ingestion events
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
logger.info("OCR service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
logger.info("Shutting down OCR service")
if event_bus:
await event_bus.stop()
logger.info("OCR service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
"""Health check endpoint"""
return {
"status": "healthy",
"service": settings.service_name,
"version": settings.service_version,
"timestamp": datetime.utcnow().isoformat(),
}
@app.post("/process/{doc_id}")
async def process_document(
doc_id: str,
background_tasks: BackgroundTasks,
strategy: str = "hybrid",
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Process document with OCR"""
with tracer.start_as_current_span("process_document") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", strategy)
try:
# Check if document exists
doc_content = await document_storage.get_document(tenant_id, doc_id)
if not doc_content:
raise HTTPException(status_code=404, detail="Document not found")
# Generate processing ID
processing_id = str(ulid.new())
span.set_attribute("processing_id", processing_id)
# Start background processing
background_tasks.add_task(
_process_document_async,
doc_id,
tenant_id,
doc_content,
strategy,
processing_id,
current_user.get("sub", "system"),
)
logger.info(
"OCR processing started", doc_id=doc_id, processing_id=processing_id
)
return {
"processing_id": processing_id,
"doc_id": doc_id,
"status": "processing",
"strategy": strategy,
}
except HTTPException:
raise
except Exception as e:
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start processing")
@app.get("/results/{doc_id}")
async def get_ocr_results(
doc_id: str,
current_user: dict[str, Any] = Depends(get_current_user),
tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
"""Get OCR results for document"""
with tracer.start_as_current_span("get_ocr_results") as span:
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
try:
# Get OCR results from storage
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
return ocr_results
except HTTPException:
raise
except Exception as e:
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get OCR results")


async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """Handle document ingestion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")

        if not doc_id or not tenant_id:
            logger.warning("Invalid document ingestion event", data=data)
            return

        # Auto-process PDF documents
        if data.get("content_type") == "application/pdf":
            logger.info("Auto-processing ingested document", doc_id=doc_id)

            # Get document content
            doc_content = await document_storage.get_document(tenant_id, doc_id)
            if doc_content:
                await _process_document_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    content=doc_content,
                    strategy="hybrid",
                    processing_id=str(ulid.new()),
                    actor=payload.actor,
                )

    except Exception as e:
        logger.error("Failed to handle document ingestion", error=str(e))


async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously"""
    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        try:
            started_at = datetime.utcnow()

            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page
            pages_data: list[dict[str, Any]] = []
            for page_num, image in enumerate(images, 1):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics: count processed documents and record the full
            # processing duration measured from the start of PDF conversion
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()
            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe((datetime.utcnow() - started_at).total_seconds())

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )
            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()


async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF pages to PNG images"""
    try:
        import fitz  # PyMuPDF

        # Open PDF from memory
        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
        images: list[bytes] = []

        for page_num in range(min(len(pdf_doc), settings.max_pages)):
            page = pdf_doc[page_num]

            # Render page to image at 2x zoom for better OCR
            mat = fitz.Matrix(2.0, 2.0)
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")
            images.append(img_data)

        pdf_doc.close()
        return images

    except ImportError:
        logger.error("PyMuPDF not available, using fallback")
        return await _pdf_to_images_fallback(pdf_content)
    except Exception as e:
        logger.error("PDF conversion failed", error=str(e))
        raise


async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
    """Fallback PDF-to-image conversion using pdf2image"""
    try:
        import io

        from pdf2image import convert_from_bytes

        images = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )

        # Convert PIL images to PNG bytes
        image_bytes: list[bytes] = []
        for img in images:
            img_buffer = io.BytesIO()
            img.save(img_buffer, format="PNG")
            image_bytes.append(img_buffer.getvalue())

        return image_bytes

    except ImportError:
        logger.error("pdf2image not available")
        raise RuntimeError("No PDF conversion library available")


async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Process single page with OCR"""
    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)
    elif strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)
    elif strategy == "hybrid":
        # Combine both approaches
        tesseract_result = await _process_with_tesseract(image_data, page_num)
        layoutlm_result = await _process_with_layoutlm(image_data, page_num)

        return {
            "page": page_num,
            "strategy": "hybrid",
            "tesseract": tesseract_result,
            "layoutlm": layoutlm_result,
            "text": tesseract_result.get("text", ""),
            "confidence": max(
                tesseract_result.get("confidence", 0),
                layoutlm_result.get("confidence", 0),
            ),
        }
    else:
        raise ValueError(f"Unknown strategy: {strategy}")


async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with Tesseract OCR"""
    try:
        import io

        import pytesseract
        from PIL import Image

        # Load image
        image = Image.open(io.BytesIO(image_data))

        # Configure Tesseract
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Extract word-level text with confidence and bounding boxes
        data = pytesseract.image_to_data(
            image, config=config, output_type=pytesseract.Output.DICT
        )

        # Process results
        words: list[dict[str, Any]] = []
        confidences: list[float] = []

        for i in range(len(data["text"])):
            if int(data["conf"][i]) > 0:  # Valid confidence
                word_data = {
                    "text": data["text"][i],
                    "confidence": int(data["conf"][i]) / 100.0,
                    "bbox": [
                        data["left"][i],
                        data["top"][i],
                        data["left"][i] + data["width"][i],
                        data["top"][i] + data["height"][i],
                    ],
                }
                words.append(word_data)
                confidences.append(word_data["confidence"])

        # Extract full text
        full_text = pytesseract.image_to_string(image, config=config)

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
            "word_count": len(words),
        }

    except ImportError:
        logger.error("pytesseract not available")
        return {
            "page": page_num,
            "strategy": "tesseract",
            "error": "pytesseract not available",
        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}


async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LayoutLM"""
    try:
        # This would integrate with the LayoutLM model configured in
        # settings.layoutlm_model; for now, return a placeholder result.
        logger.warning("LayoutLM processing not implemented")

        return {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)