# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI

import asyncio
import io
import os

# Import shared libraries
import sys
from datetime import datetime
from typing import Any, cast

import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.ocr.processor import OCRProcessor
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class OCRSettings(BaseAppSettings):
    """Settings for OCR service"""

    service_name: str = "svc-ocr"

    # OCR configuration
    tesseract_cmd: str = "/usr/bin/tesseract"
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # Layout analysis
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    confidence_threshold: float = 0.7

    # Processing limits
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    include_coordinates: bool = True
    include_confidence: bool = True

    # Vision/LLM OCR configuration
    vision_provider: str = "ollama"  # or "openai"
    vision_model: str = "llama3.2-vision:11b"
    vision_format: str = (
        "text"  # text | markdown | json | table | key_value | structured
    )
    vision_preprocess: bool = True
    openai_base_url: str = "https://api.openai.com/v1/chat/completions"
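
# With the defaults above, Tesseract is invoked as "--oem 3 --psm 6 -l eng"
# (see _process_with_tesseract). Assuming BaseAppSettings follows pydantic
# BaseSettings conventions, these fields can presumably also be overridden via
# environment variables (e.g. VISION_PROVIDER=openai); verify against libs.config.
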
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None

vision_processor: OCRProcessor | None = None

# Settings will be initialized after app creation
settings: OCRSettings


async def init_dependencies(app_settings: OCRSettings) -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, event_bus, settings, vision_processor

    settings = app_settings
    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        raise HTTPException(status_code=500, detail="Event bus not initialized")

    eb = event_bus
    # mypy: event_bus is Optional, so use local alias after check
    await eb.start()

    # Subscribe to document ingestion events
    await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)

    # Initialize shared OCRProcessor for vision strategy
    try:
        vision_processor = OCRProcessor(
            model_name=settings.vision_model,
            provider=settings.vision_provider,
            openai_base_url=settings.openai_base_url,
        )
    except Exception as e:
        logger.error("Failed to initialize vision OCR processor", error=str(e))

    logger.info("OCR service started successfully")


# Create app and settings
app, _settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))

tracer = get_tracer("svc-ocr")
metrics = get_metrics()

@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Process document with OCR"""

    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        ds = document_storage
        if ds is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )
        try:
            # Check if document exists
            doc_content = await ds.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Start background processing via sync wrapper (for mypy correctness)
            background_tasks.add_task(
                _schedule_process_document_async,
                doc_id,
                tenant_id,
                doc_content,
                strategy,
                processing_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )

            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to start processing"
            ) from e
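
# Illustrative request (header names depend on libs.security and are assumptions here):
#
#   curl -X POST "http://localhost:8002/process/<doc_id>?strategy=vision" \
#        -H "Authorization: Bearer <token>" -H "X-Tenant-ID: <tenant>"
#
# The endpoint returns immediately with a processing_id; poll GET /results/<doc_id>
# for the stored OCR output once background processing finishes.
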
@app.get("/results/{doc_id}")
async def get_ocr_results(
    doc_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Get OCR results for document"""

    with tracer.start_as_current_span("get_ocr_results") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        ds = document_storage
        if ds is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )
        try:
            # Get OCR results from storage
            ocr_results = await ds.get_ocr_result(tenant_id, doc_id)

            if not ocr_results:
                raise HTTPException(status_code=404, detail="OCR results not found")

            return ocr_results

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to get OCR results"
            ) from e

async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """Handle document ingestion events"""
    data = payload.data
    doc_id = data.get("doc_id")
    tenant_id = data.get("tenant_id")

    if not doc_id or not tenant_id:
        logger.warning("Invalid document ingestion event", data=data)
        return

    ds = document_storage
    if ds is None:
        logger.error("Document storage not initialized")
        return

    # Auto-process PDF documents
    if data.get("content_type") == "application/pdf":
        logger.info("Auto-processing ingested document", doc_id=doc_id)

        try:
            # Get document content
            doc_content = await ds.get_document(tenant_id, doc_id)
            if doc_content:
                await _process_document_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    content=doc_content,
                    strategy="hybrid",
                    processing_id=str(ulid.new()),
                    actor=payload.actor,
                )
        except Exception as e:
            logger.error(
                "Failed to handle document ingestion", doc_id=doc_id, error=str(e)
            )
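
# NOTE: _handle_document_ingested only auto-runs OCR for "application/pdf" payloads;
# other content types have to be submitted explicitly via POST /process/{doc_id}.
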
async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously"""

    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        try:
            started_at = datetime.utcnow()

            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page
            pages_data: list[dict[str, Any]] = []
            for page_num, image in enumerate(images, 0):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            ds = document_storage
            if ds is None:
                raise RuntimeError("Document storage not initialized")
            await ds.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()

            # Observe processing time measured from the start of conversion
            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe(datetime.utcnow().timestamp() - started_at.timestamp())

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            eb = event_bus
            if eb is not None:
                await eb.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()

async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF to page images without PyMuPDF.

    Primary: pdf2image (requires poppler). Fallback: extract the largest embedded
    image per page via PyPDF2/Pillow.
    """
    # First try pdf2image for full-page rasterization
    try:
        images = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )
        image_bytes: list[bytes] = []
        for img in images:
            img_buffer = io.BytesIO()
            img.save(img_buffer, format="PNG")
            image_bytes.append(img_buffer.getvalue())
        return image_bytes
    except Exception as e:
        logger.warning(
            "pdf2image conversion failed; falling back to PyPDF2", error=str(e)
        )

    # Fallback: extract largest embedded image per page using PyPDF2
    try:
        reader = PdfReader(io.BytesIO(pdf_content))
        out_images: list[bytes] = []
        for page_index, page in enumerate(reader.pages):
            if page_index >= settings.max_pages:
                break
            try:
                resources = page.get("/Resources")
                if resources is None:
                    continue
                xobject = resources.get("/XObject")
                if xobject is None:
                    continue
                xobject = xobject.get_object()

                largest = None
                largest_area = -1
                for _, obj_ref in xobject.items():
                    try:
                        obj = obj_ref.get_object()
                        if obj.get("/Subtype") != "/Image":
                            continue
                        width = int(obj.get("/Width", 0))
                        height = int(obj.get("/Height", 0))
                        area = width * height
                        if area > largest_area:
                            largest = obj
                            largest_area = area
                    except Exception:
                        continue

                if largest is None:
                    continue

                data = largest.get_data()
                filt = largest.get("/Filter")

                if filt in ("/DCTDecode", "/JPXDecode"):
                    # JPEG or JPEG2000
                    out_images.append(data)
                else:
                    # Flate or other; decode via Pillow
                    mode = "RGB"
                    colorspace = largest.get("/ColorSpace")
                    if colorspace in ("/DeviceGray",):
                        mode = "L"
                    width = int(largest.get("/Width", 0))
                    height = int(largest.get("/Height", 0))
                    try:
                        img = Image.frombytes(mode, (width, height), data)
                    except Exception:
                        img = Image.open(io.BytesIO(data))
                    buf = io.BytesIO()
                    img.save(buf, format="PNG")
                    out_images.append(buf.getvalue())
            except Exception:
                continue

        if not out_images:
            raise RuntimeError("No images extracted via PyPDF2 fallback")
        return out_images
    except Exception as fallback_e:
        logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
        raise
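
# NOTE: the PyPDF2 fallback only recovers raster images already embedded in the PDF;
# pages with no image XObjects (pure text/vector content) are skipped entirely, so
# born-digital PDFs generally require the pdf2image/poppler path above.
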
async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Process single page with OCR"""

    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)
    elif strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)
    elif strategy == "hybrid":
        # Combine both approaches
        tesseract_result = await _process_with_tesseract(image_data, page_num)
        layoutlm_result = await _process_with_layoutlm(image_data, page_num)

        return {
            "page": page_num,
            "strategy": "hybrid",
            "tesseract": tesseract_result,
            "layoutlm": layoutlm_result,
            "text": tesseract_result.get("text", ""),
            "confidence": max(
                tesseract_result.get("confidence", 0),
                layoutlm_result.get("confidence", 0),
            ),
        }
    elif strategy == "vision":
        return await _process_with_vision(image_data, page_num)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")
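
# In hybrid mode the page text comes from Tesseract (the LayoutLM path is currently a
# stub returning empty text), while the reported confidence is the higher of the two.
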
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with Tesseract OCR"""
    try:
        # Load image
        image = Image.open(io.BytesIO(image_data))

        # Configure Tesseract
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Extract text with confidence
        data = pytesseract.image_to_data(  # type: ignore
            image, config=config, output_type=pytesseract.Output.DICT
        )

        # Process results
        words: list[dict[str, Any]] = []
        confidences: list[float] = []

        for i in range(len(data["text"])):
            if int(data["conf"][i]) > 0:  # Valid confidence
                word_data = {
                    "text": data["text"][i],
                    "confidence": int(data["conf"][i]) / 100.0,
                    "bbox": [
                        data["left"][i],
                        data["top"][i],
                        data["left"][i] + data["width"][i],
                        data["top"][i] + data["height"][i],
                    ],
                }
                words.append(word_data)
                confidences.append(word_data["confidence"])

        # Extract full text
        full_text = pytesseract.image_to_string(image, config=config)

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
            "word_count": len(words),
        }

    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}
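
# Word bounding boxes above are [x0, y0, x1, y1] pixel coordinates in the rendered page
# image (origin at the top-left), built from Tesseract's left/top/width/height fields.
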
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LayoutLM"""
    try:
        # This would integrate with LayoutLM model
        # For now, return placeholder
        logger.warning("LayoutLM processing not implemented")

        return {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}

async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LLM vision OCR via shared OCRProcessor"""
    try:
        vp = vision_processor
        if vp is None:
            raise RuntimeError("Vision OCR processor not initialized")

        # Persist the page image temporarily for the processor API
        import tempfile

        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp.write(image_data)
            tmp_path = tmp.name

        try:
            text = vp.process_image(
                image_path=tmp_path,
                format_type=settings.vision_format,
                preprocess=settings.vision_preprocess,
                language=settings.languages,
            )
        finally:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

        return {
            "page": page_num,
            "strategy": "vision",
            "text": text if isinstance(text, str) else str(text),
            "confidence": 0.0,  # Not provided by LLM API
        }
    except Exception as e:
        logger.error("Vision processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "vision", "error": str(e)}

def _schedule_process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Sync wrapper to schedule the async OCR task.

    This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict.
    """
    asyncio.create_task(
        _process_document_async(
            doc_id=doc_id,
            tenant_id=tenant_id,
            content=content,
            strategy=strategy,
            processing_id=processing_id,
            actor=actor,
        )
    )
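
# NOTE: asyncio.create_task() needs a running event loop in the calling thread. This
# wrapper assumes BackgroundTasks invokes it on the loop thread; if sync callables are
# run in a worker thread instead, scheduling would have to go through something like
# asyncio.run_coroutine_threadsafe() against the main loop.
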
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )

if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)