deployment, linting and infra configuration

2025-10-14 07:42:31 +01:00
parent f0f7674b8d
commit eea46ac89c
41 changed files with 1017 additions and 1448 deletions
--- a/apps/svc_ocr/main.py
+++ b/apps/svc_ocr/main.py
@@ -1,17 +1,23 @@
 # FILE: apps/svc-ocr/main.py
 # OCR and layout extraction using Tesseract, LayoutLM, and document AI

+import asyncio
+import io
 import os

 # Import shared libraries
 import sys
 from datetime import datetime
-from typing import Any
+from typing import Any, cast

+import pytesseract
 import structlog
 import ulid
 from fastapi import BackgroundTasks, Depends, HTTPException, Request
 from fastapi.responses import JSONResponse
+from pdf2image import convert_from_bytes
+from PIL import Image
+from PyPDF2 import PdfReader

 sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

@@ -19,6 +25,7 @@ from libs.app_factory import create_app
 from libs.config import BaseAppSettings, create_event_bus, create_minio_client
 from libs.events import EventBus, EventPayload, EventTopics
 from libs.observability import get_metrics, get_tracer, setup_observability
+from libs.ocr.processor import OCRProcessor
 from libs.schemas import ErrorResponse
 from libs.security import get_current_user, get_tenant_id
 from libs.storage import DocumentStorage, StorageClient
@@ -48,28 +55,31 @@ class OCRSettings(BaseAppSettings):
    include_coordinates: bool = True
    include_confidence: bool = True

+    # Vision/LLM OCR configuration
+    vision_provider: str = "ollama"  # or "openai"
+    vision_model: str = "llama3.2-vision:11b"
+    vision_format: str = (
+        "text"  # text | markdown | json | table | key_value | structured
+    )
+    vision_preprocess: bool = True
+    openai_base_url: str = "https://api.openai.com/v1/chat/completions"

-# Create app and settings
-app, settings = create_app(
-    service_name="svc-ocr",
-    title="Tax Agent OCR Service",
-    description="OCR and layout extraction service",
-    settings_class=OCRSettings,
-)  # fmt: skip

 # Global clients
 storage_client: StorageClient | None = None
 document_storage: DocumentStorage | None = None
 event_bus: EventBus | None = None
-tracer = get_tracer("svc-ocr")
-metrics = get_metrics()
+
+vision_processor: OCRProcessor | None = None
+# Settings will be initialized after app creation
+settings: OCRSettings


-@app.on_event("startup")
-async def startup_event() -> None:
+async def init_dependencies(app_settings: OCRSettings) -> None:
    """Initialize service dependencies"""
-    global storage_client, document_storage, event_bus
+    global storage_client, document_storage, event_bus, settings, vision_processor

+    settings = app_settings
    logger.info("Starting OCR service")

    # Setup observability
@@ -79,42 +89,44 @@ async def startup_event() -> None:
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)
-
    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        raise HTTPException(status_code=500, detail="Event bus not initialized")

-    await event_bus.start()
+    eb = event_bus
+    # mypy: event_bus is Optional, so use local alias after check
+    await eb.start()

    # Subscribe to document ingestion events
-    await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
+    await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
+
+    # Initialize shared OCRProcessor for vision strategy
+    try:
+        vision_processor = OCRProcessor(
+            model_name=settings.vision_model,
+            provider=settings.vision_provider,
+            openai_base_url=settings.openai_base_url,
+        )
+    except Exception as e:
+        logger.error("Failed to initialize vision OCR processor", error=str(e))

    logger.info("OCR service started successfully")


-@app.on_event("shutdown")
-async def shutdown_event() -> None:
-    """Cleanup service dependencies"""
-    global event_bus
+# Create app and settings
+app, _settings = create_app(
+    service_name="svc-ocr",
+    title="Tax Agent OCR Service",
+    description="OCR and layout extraction service",
+    settings_class=OCRSettings,
+)  # fmt: skip

-    logger.info("Shutting down OCR service")
+# Initialize dependencies immediately
+asyncio.run(init_dependencies(cast(OCRSettings, _settings)))

-    if event_bus:
-        await event_bus.stop()
-
-    logger.info("OCR service shutdown complete")
-
-
-@app.get("/health")
-async def health_check() -> dict[str, Any]:
-    """Health check endpoint"""
-    return {
-        "status": "healthy",
-        "service": settings.service_name,
-        "version": settings.service_version,
-        "timestamp": datetime.utcnow().isoformat(),
-    }
+tracer = get_tracer("svc-ocr")
+metrics = get_metrics()


@app.post("/process/{doc_id}")
@@ -132,9 +144,14 @@ async def process_document(
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

+        ds = document_storage
+        if ds is None:
+            raise HTTPException(
+                status_code=500, detail="Document storage not initialized"
+            )
        try:
            # Check if document exists
-            doc_content = await document_storage.get_document(tenant_id, doc_id)
+            doc_content = await ds.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

@@ -142,9 +159,9 @@ async def process_document(
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

-            # Start background processing
+            # Start background processing via sync wrapper (for mypy correctness)
            background_tasks.add_task(
-                _process_document_async,
+                _schedule_process_document_async,
                doc_id,
                tenant_id,
                doc_content,
@@ -168,7 +185,9 @@ async def process_document(
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
-            raise HTTPException(status_code=500, detail="Failed to start processing")
+            raise HTTPException(
+                status_code=500, detail="Failed to start processing"
+            ) from e


@app.get("/results/{doc_id}")
@@ -183,9 +202,14 @@ async def get_ocr_results(
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

+        ds = document_storage
+        if ds is None:
+            raise HTTPException(
+                status_code=500, detail="Document storage not initialized"
+            )
        try:
            # Get OCR results from storage
-            ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
+            ocr_results = await ds.get_ocr_result(tenant_id, doc_id)

            if not ocr_results:
                raise HTTPException(status_code=404, detail="OCR results not found")
@@ -196,26 +220,32 @@ async def get_ocr_results(
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
-            raise HTTPException(status_code=500, detail="Failed to get OCR results")
+            raise HTTPException(
+                status_code=500, detail="Failed to get OCR results"
+            ) from e


 async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """Handle document ingestion events"""
-    try:
-        data = payload.data
-        doc_id = data.get("doc_id")
-        tenant_id = data.get("tenant_id")
+    data = payload.data
+    doc_id = data.get("doc_id")
+    tenant_id = data.get("tenant_id")

-        if not doc_id or not tenant_id:
-            logger.warning("Invalid document ingestion event", data=data)
-            return
+    if not doc_id or not tenant_id:
+        logger.warning("Invalid document ingestion event", data=data)
+        return
+    ds = document_storage
+    if ds is None:
+        logger.error("Document storage not initialized")
+        return

-        # Auto-process PDF documents
-        if data.get("content_type") == "application/pdf":
-            logger.info("Auto-processing ingested document", doc_id=doc_id)
+    # Auto-process PDF documents
+    if data.get("content_type") == "application/pdf":
+        logger.info("Auto-processing ingested document", doc_id=doc_id)

+        try:
            # Get document content
-            doc_content = await document_storage.get_document(tenant_id, doc_id)
+            doc_content = await ds.get_document(tenant_id, doc_id)
            if doc_content:
                await _process_document_async(
                    doc_id=doc_id,
@@ -225,9 +255,10 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
                    processing_id=str(ulid.new()),
                    actor=payload.actor,
                )
-
-    except Exception as e:
-        logger.error("Failed to handle document ingestion", error=str(e))
+        except Exception as e:
+            logger.error(
+                "Failed to handle document ingestion", doc_id=doc_id, error=str(e)
+            )


 async def _process_document_async(
@@ -250,8 +281,8 @@ async def _process_document_async(
            images = await _pdf_to_images(content)

            # Process each page
-            pages_data: list[Any] = []
-            for page_num, image in enumerate(images, 1):
+            pages_data: list[dict[str, Any]] = []
+            for page_num, image in enumerate(images, 0):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

@@ -270,7 +301,10 @@ async def _process_document_async(
            }

            # Store results
-            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)
+            ds = document_storage
+            if ds is None:
+                raise RuntimeError("Document storage not initialized")
+            await ds.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
@@ -282,7 +316,7 @@ async def _process_document_async(
            ).observe(
                datetime.utcnow().timestamp()
                - datetime.fromisoformat(
-                    ocr_results["processed_at"].replace("Z", "")
+                    ocr_results["processed_at"].replace("Z", "")  # type: ignore
                ).timestamp()
            )

@@ -300,7 +334,9 @@ async def _process_document_async(
                tenant_id=tenant_id,
            )

-            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)
+            eb = event_bus
+            if eb is not None:
+                await eb.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
@@ -316,58 +352,91 @@ async def _process_document_async(


 async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
-    """Convert PDF to images"""
+    """Convert PDF to page images without PyMuPDF.
+
+    Primary: pdf2image (requires poppler). Fallback: extract largest embedded image per page via PyPDF2/Pillow.
+    """
+    # First try pdf2image for full-page rasterization
    try:
-        import fitz  # PyMuPDF
-
-        # Open PDF
-        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
-
-        images: list[Any] = []
-        for page_num in range(min(len(pdf_doc), settings.max_pages)):
-            page = pdf_doc[page_num]
-
-            # Render page to image
-            mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR
-            pix = page.get_pixmap(matrix=mat)
-            img_data = pix.tobytes("png")
-
-            images.append(img_data)
-
-        pdf_doc.close()
-        return images
-
-    except ImportError:
-        logger.error("PyMuPDF not available, using fallback")
-        return await _pdf_to_images_fallback(pdf_content)
-    except Exception as e:
-        logger.error("PDF conversion failed", error=str(e))
-        raise
-
-
-async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
-    """Fallback PDF to images conversion"""
-    try:
-        from pdf2image import convert_from_bytes
-
        images = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )
-
-        # Convert PIL images to bytes
-        image_bytes: list[Any] = []
+        image_bytes: list[bytes] = []
        for img in images:
-            import io
-
            img_buffer = io.BytesIO()
            img.save(img_buffer, format="PNG")
            image_bytes.append(img_buffer.getvalue())
-
        return image_bytes
+    except Exception as e:
+        logger.warning(
+            "pdf2image conversion failed; falling back to PyPDF2", error=str(e)
+        )

-    except ImportError:
-        logger.error("pdf2image not available")
-        raise Exception("No PDF conversion library available")
+    # Fallback: extract largest embedded image per page using PyPDF2
+    try:
+        reader = PdfReader(io.BytesIO(pdf_content))
+        out_images: list[bytes] = []
+        for page_index, page in enumerate(reader.pages):
+            if page_index >= settings.max_pages:
+                break
+            try:
+                resources = page.get("/Resources")
+                if resources is None:
+                    continue
+                xobject = resources.get("/XObject")
+                if xobject is None:
+                    continue
+                xobject = xobject.get_object()
+
+                largest = None
+                largest_area = -1
+                for _, obj_ref in xobject.items():
+                    try:
+                        obj = obj_ref.get_object()
+                        if obj.get("/Subtype") != "/Image":
+                            continue
+                        width = int(obj.get("/Width", 0))
+                        height = int(obj.get("/Height", 0))
+                        area = width * height
+                        if area > largest_area:
+                            largest = obj
+                            largest_area = area
+                    except Exception:
+                        continue
+
+                if largest is None:
+                    continue
+
+                data = largest.get_data()
+                filt = largest.get("/Filter")
+
+                if filt in ("/DCTDecode", "/JPXDecode"):
+                    # JPEG or JPEG2000
+                    out_images.append(data)
+                else:
+                    # Flate or other; decode via Pillow
+                    mode = "RGB"
+                    colorspace = largest.get("/ColorSpace")
+                    if colorspace in ("/DeviceGray",):
+                        mode = "L"
+                    width = int(largest.get("/Width", 0))
+                    height = int(largest.get("/Height", 0))
+                    try:
+                        img = Image.frombytes(mode, (width, height), data)
+                    except Exception:
+                        img = Image.open(io.BytesIO(data))
+                    buf = io.BytesIO()
+                    img.save(buf, format="PNG")
+                    out_images.append(buf.getvalue())
+            except Exception:
+                continue
+
+        if not out_images:
+            raise RuntimeError("No images extracted via PyPDF2 fallback")
+        return out_images
+    except Exception as fallback_e:
+        logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
+        raise


 async def _process_page(
@@ -395,6 +464,8 @@ async def _process_page(
                layoutlm_result.get("confidence", 0),
            ),
        }
+    elif strategy == "vision":
+        return await _process_with_vision(image_data, page_num)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

@@ -402,11 +473,6 @@ async def _process_page(
 async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with Tesseract OCR"""
    try:
-        import io
-
-        import pytesseract
-        from PIL import Image
-
        # Load image
        image = Image.open(io.BytesIO(image_data))

@@ -414,13 +480,13 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Extract text with confidence
-        data = pytesseract.image_to_data(
+        data = pytesseract.image_to_data(  # type: ignore
            image, config=config, output_type=pytesseract.Output.DICT
        )

        # Process results
-        words: list[Any] = []
-        confidences: list[Any] = []
+        words: list[dict[str, Any]] = []
+        confidences: list[float] = []

        for i in range(len(data["text"])):
            if int(data["conf"][i]) > 0:  # Valid confidence
@@ -449,13 +515,6 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
            "word_count": len(words),
        }

-    except ImportError:
-        logger.error("pytesseract not available")
-        return {
-            "page": page_num,
-            "strategy": "tesseract",
-            "error": "pytesseract not available",
-        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}
@@ -482,6 +541,68 @@ async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str,
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}


+async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
+    """Process one page with LLM vision OCR via the shared OCRProcessor.
+
+    The page image is written to a temporary ``.png`` file because the
+    processor is called with a file path rather than raw bytes. The file is
+    always removed afterwards, including when writing or processing fails.
+
+    Args:
+        image_data: Image bytes for a single page.
+        page_num: Page number echoed back in the result and in error logs.
+
+    Returns:
+        On success, a dict with "page", "strategy", "text" and "confidence"
+        (always 0.0, because the LLM API does not provide one). On any
+        failure, a dict with "page", "strategy" and "error"; the exception is
+        logged and not propagated to the caller.
+    """
+    try:
+        vp = vision_processor
+        if vp is None:
+            raise RuntimeError("Vision OCR processor not initialized")
+
+        # Persist the page image temporarily for the processor API
+        import tempfile
+
+        tmp_path: str | None = None
+        try:
+            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
+                # Record the path before writing so the cleanup below still
+                # runs when the write fails part-way (delete=False would
+                # otherwise leave the file behind).
+                tmp_path = tmp.name
+                tmp.write(image_data)
+
+            # process_image is a blocking call; run it in a worker thread so
+            # the event loop is not stalled for the duration of the request.
+            text = await asyncio.to_thread(
+                vp.process_image,
+                image_path=tmp_path,
+                format_type=settings.vision_format,
+                preprocess=settings.vision_preprocess,
+                language=settings.languages,
+            )
+        finally:
+            if tmp_path is not None:
+                try:
+                    os.remove(tmp_path)
+                except OSError:
+                    # Best-effort cleanup: a leftover temp file must not turn
+                    # a successful OCR result into a failure.
+                    pass
+
+        return {
+            "page": page_num,
+            "strategy": "vision",
+            "text": text if isinstance(text, str) else str(text),
+            "confidence": 0.0,  # Not provided by LLM API
+        }
+    except Exception as e:
+        logger.error("Vision processing failed", page=page_num, error=str(e))
+        return {"page": page_num, "strategy": "vision", "error": str(e)}
+
+
+# Strong references to OCR tasks scheduled on a running loop. asyncio keeps only
+# weak references to tasks, so an unreferenced task may be collected mid-run.
+_pending_ocr_tasks: set[asyncio.Task[Any]] = set()
+
+
+def _schedule_process_document_async(
+    doc_id: str,
+    tenant_id: str,
+    content: bytes,
+    strategy: str,
+    processing_id: str,
+    actor: str,
+) -> None:
+    """Sync wrapper that runs the async OCR task for one document.
+
+    This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict.
+
+    Starlette executes non-async background tasks in a worker thread, where
+    no event loop is running. ``asyncio.create_task()`` raises ``RuntimeError``
+    in that situation, so the coroutine is instead driven to completion on a
+    fresh event loop in the calling thread. When the wrapper is called from a
+    thread that does have a running loop, the work is scheduled on that loop
+    and the call returns immediately.
+
+    Args:
+        doc_id: Identifier of the document to process.
+        tenant_id: Tenant that owns the document.
+        content: Raw document bytes.
+        strategy: OCR strategy name forwarded to the pipeline.
+        processing_id: Identifier of this processing run.
+        actor: Actor on whose behalf the processing runs.
+    """
+    coro = _process_document_async(
+        doc_id=doc_id,
+        tenant_id=tenant_id,
+        content=content,
+        strategy=strategy,
+        processing_id=processing_id,
+        actor=actor,
+    )
+
+    try:
+        loop = asyncio.get_running_loop()
+    except RuntimeError:
+        # No running loop in this thread (the worker-thread case): block this
+        # thread, not the server loop, until processing has finished.
+        asyncio.run(coro)
+        return
+
+    task = loop.create_task(coro)
+    # Hold a reference until the task finishes, then drop it again.
+    _pending_ocr_tasks.add(task)
+    task.add_done_callback(_pending_ocr_tasks.discard)
+
+
@app.exception_handler(HTTPException)
 async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
--- a/apps/svc_ocr/requirements.txt
+++ b/apps/svc_ocr/requirements.txt
@@ -5,7 +5,7 @@
 pytesseract>=0.3.13

 # PDF processing
-PyMuPDF>=1.26.4
+PyPDF2>=3.0.1
 pdf2image>=1.17.0

 # Image processing