Initial commit

2025-10-11 08:41:36 +01:00
commit b324ff09ef
276 changed files with 55220 additions and 0 deletions
--- a/apps/svc_ocr/Dockerfile
+++ b/apps/svc_ocr/Dockerfile
@@ -0,0 +1,43 @@
+# Dockerfile for svc_ocr - Uses base-ml image
+# Base image contains: FastAPI, database drivers, transformers, PyTorch, numpy, etc.
+# This Dockerfile adds OCR-specific dependencies and application code
+
+ARG REGISTRY=gitea.harkon.co.uk
+ARG OWNER=harkon
+ARG BASE_VERSION=v1.0.1
+FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
+
+# Switch to root to install system and service-specific dependencies
+USER root
+
+# Install OCR runtime dependencies (Tesseract, poppler).
+# --no-install-recommends keeps optional packages out of the image.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    poppler-utils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy service-specific requirements and install
+COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
+RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
+
+# Copy application code
+COPY libs/ ./libs/
+COPY apps/svc_ocr/ ./apps/svc_ocr/
+
+# Set permissions and switch to non-root user
+RUN chown -R appuser:appuser /app
+USER appuser
+
+# Health check
+# NOTE(review): apps/svc_ocr/main.py registers GET /health; the /healthz route
+# and the curl binary are assumed to come from create_app and the base image
+# respectively - confirm, otherwise this probe always fails.
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/healthz || exit 1
+
+# Expose port
+EXPOSE 8000
+
+# Run the application
+CMD ["python", "-m", "uvicorn", "apps.svc_ocr.main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/apps/svc_ocr/main.py
+++ b/apps/svc_ocr/main.py
@@ -0,0 +1,504 @@
+# FILE: apps/svc_ocr/main.py
+# OCR and layout extraction using Tesseract, LayoutLM, and document AI
+
+import os
+
+# Import shared libraries
+import sys
+from datetime import datetime
+from typing import Any
+
+import structlog
+import ulid
+from fastapi import BackgroundTasks, Depends, HTTPException, Request
+from fastapi.responses import JSONResponse
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
+
+from libs.app_factory import create_app
+from libs.config import BaseAppSettings, create_event_bus, create_minio_client
+from libs.events import EventBus, EventPayload, EventTopics
+from libs.observability import get_metrics, get_tracer, setup_observability
+from libs.schemas import ErrorResponse
+from libs.security import get_current_user, get_tenant_id
+from libs.storage import DocumentStorage, StorageClient
+
+logger = structlog.get_logger()
+
+
+class OCRSettings(BaseAppSettings):
+    """Settings for OCR service, extending the shared BaseAppSettings."""
+
+    service_name: str = "svc-ocr"
+
+    # OCR configuration
+    # NOTE(review): tesseract_cmd is not read anywhere in this file - confirm
+    # whether pytesseract is meant to be pointed at it.
+    tesseract_cmd: str = "/usr/bin/tesseract"
+    # Combined with "-l <languages>" to form the pytesseract config string.
+    tesseract_config: str = "--oem 3 --psm 6"
+    languages: str = "eng"
+
+    # Layout analysis
+    # NOTE(review): layoutlm_model is unused while LayoutLM is a placeholder.
+    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
+    # Only echoed into the result metadata in this file; no filtering uses it.
+    confidence_threshold: float = 0.7
+
+    # Processing limits
+    # Upper bound on the number of PDF pages rendered per document.
+    max_pages: int = 50
+    # NOTE(review): max_file_size is not enforced anywhere in this file.
+    max_file_size: int = 100 * 1024 * 1024  # 100MB
+
+    # Output configuration
+    # NOTE(review): neither flag below is consulted in this file - confirm intent.
+    include_coordinates: bool = True
+    include_confidence: bool = True
+
+
+# Create app and settings
+# create_app returns a pair that is unpacked into the application object and
+# the settings instance built from OCRSettings.
+app, settings = create_app(
+    service_name="svc-ocr",
+    title="Tax Agent OCR Service",
+    description="OCR and layout extraction service",
+    settings_class=OCRSettings,
+)  # fmt: skip
+
+# Global clients
+# These stay None until startup_event() assigns them.
+storage_client: StorageClient | None = None
+document_storage: DocumentStorage | None = None
+event_bus: EventBus | None = None
+tracer = get_tracer("svc-ocr")
+metrics = get_metrics()
+
+
+@app.on_event("startup")
+async def startup_event() -> None:
+    """Initialize service dependencies.
+
+    Sets up observability, builds the storage clients, starts the event bus
+    and subscribes to document-ingestion events.
+
+    Raises:
+        RuntimeError: If the event bus could not be created.
+    """
+    global storage_client, document_storage, event_bus
+
+    logger.info("Starting OCR service")
+
+    # Setup observability
+    setup_observability(settings)
+
+    # Initialize MinIO client
+    minio_client = create_minio_client(settings)
+    storage_client = StorageClient(minio_client)
+    document_storage = DocumentStorage(storage_client)
+
+    # Initialize event bus
+    event_bus = create_event_bus(settings)
+    if not event_bus:
+        # There is no HTTP request to answer during startup, so an
+        # HTTPException is meaningless here; fail startup with a plain error.
+        raise RuntimeError("Event bus not initialized")
+
+    await event_bus.start()
+
+    # Subscribe to document ingestion events
+    await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
+
+    logger.info("OCR service started successfully")
+
+
+@app.on_event("shutdown")
+async def shutdown_event() -> None:
+    """Stop the event bus, if one was created, when the app shuts down."""
+    logger.info("Shutting down OCR service")
+
+    # Read-only access to the module-level bus; nothing is rebound here.
+    bus = event_bus
+    if bus:
+        await bus.stop()
+
+    logger.info("OCR service shutdown complete")
+
+
+@app.get("/health")
+async def health_check() -> dict[str, Any]:
+    """Report service name, version and the current (naive) UTC timestamp."""
+    report: dict[str, Any] = {"status": "healthy"}
+    report["service"] = settings.service_name
+    report["version"] = settings.service_version
+    report["timestamp"] = datetime.utcnow().isoformat()
+    return report
+
+
+@app.post("/process/{doc_id}")
+async def process_document(
+    doc_id: str,
+    background_tasks: BackgroundTasks,
+    strategy: str = "hybrid",
+    current_user: dict[str, Any] = Depends(get_current_user),
+    tenant_id: str = Depends(get_tenant_id),
+) -> dict[str, Any]:
+    """Queue OCR processing for a stored document.
+
+    Args:
+        doc_id: Identifier of the document to process.
+        background_tasks: FastAPI task queue the OCR job is scheduled on.
+        strategy: One of "tesseract", "layoutlm" or "hybrid".
+        current_user: Authenticated user; its "sub" entry becomes the actor.
+        tenant_id: Tenant that owns the document.
+
+    Returns:
+        A dict with the new processing_id, the doc_id, status "processing"
+        and the chosen strategy.
+
+    Raises:
+        HTTPException: 400 for an unknown strategy, 404 when the document is
+            not found, 500 when scheduling fails for any other reason.
+    """
+
+    with tracer.start_as_current_span("process_document") as span:
+        span.set_attribute("doc_id", doc_id)
+        span.set_attribute("tenant_id", tenant_id)
+        span.set_attribute("strategy", strategy)
+
+        # Reject unknown strategies up front; otherwise the request would be
+        # acknowledged as "processing" and only fail inside the background task.
+        if strategy not in ("tesseract", "layoutlm", "hybrid"):
+            raise HTTPException(
+                status_code=400, detail=f"Unknown strategy: {strategy}"
+            )
+
+        try:
+            # Check if document exists
+            doc_content = await document_storage.get_document(tenant_id, doc_id)
+            if not doc_content:
+                raise HTTPException(status_code=404, detail="Document not found")
+
+            # Generate processing ID
+            processing_id = str(ulid.new())
+            span.set_attribute("processing_id", processing_id)
+
+            # Start background processing
+            background_tasks.add_task(
+                _process_document_async,
+                doc_id,
+                tenant_id,
+                doc_content,
+                strategy,
+                processing_id,
+                current_user.get("sub", "system"),
+            )
+
+            logger.info(
+                "OCR processing started", doc_id=doc_id, processing_id=processing_id
+            )
+
+            return {
+                "processing_id": processing_id,
+                "doc_id": doc_id,
+                "status": "processing",
+                "strategy": strategy,
+            }
+
+        except HTTPException:
+            # Deliberate HTTP errors (e.g. the 404 above) pass through untouched.
+            raise
+        except Exception as e:
+            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
+            raise HTTPException(
+                status_code=500, detail="Failed to start processing"
+            ) from e
+
+
+@app.get("/results/{doc_id}")
+async def get_ocr_results(
+    doc_id: str,
+    current_user: dict[str, Any] = Depends(get_current_user),
+    tenant_id: str = Depends(get_tenant_id),
+) -> dict[str, Any]:
+    """Return the stored OCR output for a document.
+
+    Responds with 404 when nothing is stored for the tenant/document pair and
+    with 500 when the lookup fails for any other reason.
+    """
+
+    with tracer.start_as_current_span("get_ocr_results") as span:
+        for attr_name, attr_value in (("doc_id", doc_id), ("tenant_id", tenant_id)):
+            span.set_attribute(attr_name, attr_value)
+
+        try:
+            stored = await document_storage.get_ocr_result(tenant_id, doc_id)
+            if stored:
+                return stored
+            raise HTTPException(status_code=404, detail="OCR results not found")
+
+        except HTTPException:
+            # Deliberate HTTP errors pass through untouched.
+            raise
+        except Exception as exc:
+            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(exc))
+            raise HTTPException(status_code=500, detail="Failed to get OCR results")
+
+
+async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
+    """Handle document ingestion events.
+
+    PDF documents are fetched from storage and run through the "hybrid" OCR
+    pipeline; events for other content types are ignored. Failures are logged
+    and never propagated to the event bus.
+
+    Args:
+        topic: Topic the event arrived on (unused).
+        payload: Event whose data carries doc_id, tenant_id and content_type.
+    """
+    try:
+        data = payload.data
+        doc_id = data.get("doc_id")
+        tenant_id = data.get("tenant_id")
+
+        if not doc_id or not tenant_id:
+            logger.warning("Invalid document ingestion event", data=data)
+            return
+
+        # Auto-process PDF documents only
+        if data.get("content_type") != "application/pdf":
+            return
+
+        logger.info("Auto-processing ingested document", doc_id=doc_id)
+
+        # Get document content
+        doc_content = await document_storage.get_document(tenant_id, doc_id)
+        if not doc_content:
+            # Leave a trace instead of dropping the event silently.
+            logger.warning(
+                "Ingested document not found in storage",
+                doc_id=doc_id,
+                tenant_id=tenant_id,
+            )
+            return
+
+        await _process_document_async(
+            doc_id=doc_id,
+            tenant_id=tenant_id,
+            content=doc_content,
+            strategy="hybrid",
+            processing_id=str(ulid.new()),
+            actor=payload.actor,
+        )
+
+    except Exception as e:
+        logger.error("Failed to handle document ingestion", error=str(e))
+
+
+async def _process_document_async(
+    doc_id: str,
+    tenant_id: str,
+    content: bytes,
+    strategy: str,
+    processing_id: str,
+    actor: str,
+) -> None:
+    """Process document asynchronously.
+
+    Renders the PDF to page images, runs OCR on each page, stores the combined
+    result, records metrics and publishes a DOC_OCR_READY event.
+
+    Args:
+        doc_id: Identifier of the document being processed.
+        tenant_id: Tenant that owns the document.
+        content: Raw PDF bytes.
+        strategy: OCR strategy passed through to _process_page.
+        processing_id: Identifier of this processing run.
+        actor: Actor recorded on the completion event.
+
+    Errors are logged and counted but not re-raised, since this runs as a
+    background task or from an event handler.
+    """
+
+    with tracer.start_as_current_span("process_document_async") as span:
+        span.set_attribute("doc_id", doc_id)
+        span.set_attribute("processing_id", processing_id)
+        span.set_attribute("strategy", strategy)
+
+        # Taken before any work so the duration metric covers conversion, OCR
+        # and storage (it used to be measured from "processed_at", which is
+        # only stamped after OCR has finished).
+        started_at = datetime.utcnow()
+
+        try:
+            # Convert PDF to images
+            images = await _pdf_to_images(content)
+
+            # Process each page (page numbers are 1-based)
+            pages_data: list[Any] = []
+            for page_num, image in enumerate(images, 1):
+                page_data = await _process_page(image, page_num, strategy)
+                pages_data.append(page_data)
+
+            # Combine results
+            ocr_results = {
+                "doc_id": doc_id,
+                "processing_id": processing_id,
+                "strategy": strategy,
+                "processed_at": datetime.utcnow().isoformat(),
+                "total_pages": len(pages_data),
+                "pages": pages_data,
+                "metadata": {
+                    "confidence_threshold": settings.confidence_threshold,
+                    "languages": settings.languages,
+                },
+            }
+
+            # Store results
+            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)
+
+            # Update metrics
+            metrics.counter("documents_processed_total").labels(
+                tenant_id=tenant_id, strategy=strategy
+            ).inc()
+
+            elapsed_seconds = (datetime.utcnow() - started_at).total_seconds()
+            metrics.histogram("processing_duration_seconds").labels(
+                strategy=strategy
+            ).observe(elapsed_seconds)
+
+            # Publish completion event
+            event_payload = EventPayload(
+                data={
+                    "doc_id": doc_id,
+                    "tenant_id": tenant_id,
+                    "processing_id": processing_id,
+                    "strategy": strategy,
+                    "total_pages": len(pages_data),
+                    "ocr_results": ocr_results,
+                },
+                actor=actor,
+                tenant_id=tenant_id,
+            )
+
+            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)
+
+            logger.info(
+                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
+            )
+
+        except Exception as e:
+            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))
+
+            # Update error metrics
+            metrics.counter("processing_errors_total").labels(
+                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
+            ).inc()
+
+
+async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
+    """Convert PDF to images.
+
+    Renders up to settings.max_pages pages with PyMuPDF at 2x zoom and returns
+    one PNG byte string per page. Falls back to _pdf_to_images_fallback when
+    PyMuPDF cannot be imported.
+
+    Args:
+        pdf_content: Raw PDF bytes.
+
+    Returns:
+        PNG-encoded page images in page order.
+
+    Raises:
+        Exception: Any rendering error is logged and re-raised.
+    """
+    try:
+        import fitz  # PyMuPDF
+    except ImportError:
+        logger.error("PyMuPDF not available, using fallback")
+        return await _pdf_to_images_fallback(pdf_content)
+
+    try:
+        # Open PDF
+        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
+        try:
+            # 2x zoom for better OCR; the matrix is the same for every page.
+            mat = fitz.Matrix(2.0, 2.0)
+            page_count = min(len(pdf_doc), settings.max_pages)
+
+            images: list[bytes] = []
+            for page_num in range(page_count):
+                pix = pdf_doc[page_num].get_pixmap(matrix=mat)
+                images.append(pix.tobytes("png"))
+            return images
+        finally:
+            # Close the document even when rendering a page fails.
+            pdf_doc.close()
+
+    except Exception as e:
+        logger.error("PDF conversion failed", error=str(e))
+        raise
+
+
+async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
+    """Render PDF pages to PNG bytes with pdf2image.
+
+    Converts pages 1..settings.max_pages at 200 dpi. Raises a generic
+    Exception when pdf2image cannot be imported.
+    """
+    try:
+        import io
+
+        from pdf2image import convert_from_bytes
+
+        pil_pages = convert_from_bytes(
+            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
+        )
+
+        def _as_png(pil_page: Any) -> bytes:
+            # Serialize one PIL image into an in-memory PNG.
+            sink = io.BytesIO()
+            pil_page.save(sink, format="PNG")
+            return sink.getvalue()
+
+        return [_as_png(pil_page) for pil_page in pil_pages]
+
+    except ImportError:
+        logger.error("pdf2image not available")
+        raise Exception("No PDF conversion library available")
+
+
+async def _process_page(
+    image_data: bytes, page_num: int, strategy: str
+) -> dict[str, Any]:
+    """Run the requested OCR strategy on one page image.
+
+    "tesseract" and "layoutlm" return that engine's result directly; "hybrid"
+    runs both and merges them, taking the text from Tesseract and the higher
+    of the two confidences. Any other strategy raises ValueError.
+    """
+
+    if strategy == "tesseract":
+        return await _process_with_tesseract(image_data, page_num)
+
+    if strategy == "layoutlm":
+        return await _process_with_layoutlm(image_data, page_num)
+
+    if strategy != "hybrid":
+        raise ValueError(f"Unknown strategy: {strategy}")
+
+    # Hybrid: run both engines, Tesseract first.
+    ocr_part = await _process_with_tesseract(image_data, page_num)
+    layout_part = await _process_with_layoutlm(image_data, page_num)
+
+    page_text = ocr_part.get("text", "")
+    best_confidence = max(
+        ocr_part.get("confidence", 0),
+        layout_part.get("confidence", 0),
+    )
+
+    return {
+        "page": page_num,
+        "strategy": "hybrid",
+        "tesseract": ocr_part,
+        "layoutlm": layout_part,
+        "text": page_text,
+        "confidence": best_confidence,
+    }
+
+
+async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
+    """Process page with Tesseract OCR.
+
+    Args:
+        image_data: Encoded page image bytes readable by PIL.
+        page_num: Page number echoed into the result.
+
+    Returns:
+        On success, a dict with the page text, per-word entries (text,
+        confidence scaled to 0..1, bbox as [left, top, right, bottom]), the
+        mean word confidence and the word count. On failure, a dict with
+        "page", "strategy" and an "error" message; nothing is raised.
+    """
+    try:
+        import io
+
+        import pytesseract
+        from PIL import Image
+
+        # Configure Tesseract
+        config = f"{settings.tesseract_config} -l {settings.languages}"
+
+        # The context manager releases the image once both OCR passes are done.
+        with Image.open(io.BytesIO(image_data)) as image:
+            # Extract word-level data with confidence
+            data = pytesseract.image_to_data(
+                image, config=config, output_type=pytesseract.Output.DICT
+            )
+
+            # Extract full text
+            full_text = pytesseract.image_to_string(image, config=config)
+
+        # Process results
+        words: list[dict[str, Any]] = []
+        rows = zip(
+            data["text"],
+            data["conf"],
+            data["left"],
+            data["top"],
+            data["width"],
+            data["height"],
+        )
+        for text, raw_conf, left, top, width, height in rows:
+            # float() accepts ints, floats and numeric strings such as "96.5",
+            # whereas int() raises on a fractional string.
+            conf = float(raw_conf)
+            if conf <= 0:  # Skip entries without a positive confidence
+                continue
+            words.append(
+                {
+                    "text": text,
+                    "confidence": conf / 100.0,
+                    "bbox": [left, top, left + width, top + height],
+                }
+            )
+
+        confidences = [word["confidence"] for word in words]
+
+        return {
+            "page": page_num,
+            "strategy": "tesseract",
+            "text": full_text.strip(),
+            "words": words,
+            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
+            "word_count": len(words),
+        }
+
+    except ImportError:
+        logger.error("pytesseract not available")
+        return {
+            "page": page_num,
+            "strategy": "tesseract",
+            "error": "pytesseract not available",
+        }
+    except Exception as e:
+        logger.error("Tesseract processing failed", page=page_num, error=str(e))
+        return {"page": page_num, "strategy": "tesseract", "error": str(e)}
+
+
+async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
+    """Placeholder LayoutLM step: logs a warning and returns an empty result.
+
+    The returned dict carries empty text, no layout elements, confidence 0.0
+    and the error marker "Not implemented"; image_data is not inspected.
+    """
+    try:
+        # No model integration yet; emit a stub so callers get a stable shape.
+        logger.warning("LayoutLM processing not implemented")
+
+        stub: dict[str, Any] = dict(
+            page=page_num,
+            strategy="layoutlm",
+            text="",
+            layout_elements=[],
+            confidence=0.0,
+            error="Not implemented",
+        )
+        return stub
+
+    except Exception as exc:
+        logger.error("LayoutLM processing failed", page=page_num, error=str(exc))
+        return dict(page=page_num, strategy="layoutlm", error=str(exc))
+
+
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
+    """Handle HTTP exceptions with RFC7807 format.
+
+    Args:
+        request: Request being served; its URL becomes the "instance" field.
+        exc: The raised HTTPException.
+
+    Returns:
+        A JSONResponse with the exception's status code, an ErrorResponse
+        body, and any headers the exception carried.
+    """
+    return JSONResponse(
+        status_code=exc.status_code,
+        content=ErrorResponse(
+            type=f"https://httpstatuses.com/{exc.status_code}",
+            title=exc.detail,
+            status=exc.status_code,
+            detail=exc.detail,
+            instance=str(request.url),
+            trace_id="",
+        ).model_dump(),
+        # Forward headers attached to the exception (e.g. WWW-Authenticate on
+        # a 401) instead of dropping them.
+        headers=getattr(exc, "headers", None),
+    )
+
+
+# Direct-execution entry point: serves the app on port 8002 with auto-reload.
+# NOTE(review): the "main:app" import string presumably requires running from
+# this file's directory - confirm.
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)
--- a/apps/svc_ocr/requirements.txt
+++ b/apps/svc_ocr/requirements.txt
@@ -0,0 +1,16 @@
+# Service-specific dependencies for svc_ocr
+# NOTE: ML dependencies (transformers, torch, numpy) are in base-ml image
+
+# OCR engines (lightweight)
+pytesseract>=0.3.13
+
+# PDF processing
+PyMuPDF>=1.26.4
+pdf2image>=1.17.0
+
+# Image processing
+Pillow>=11.3.0
+opencv-python-headless>=4.12.0.88  # Headless version is smaller
+
+# Computer vision (torchvision not in base-ml)
+torchvision>=0.23.0