deployment, linting and infra configuration
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

This commit is contained in:
harkon
2025-10-14 07:42:31 +01:00
parent f0f7674b8d
commit eea46ac89c
41 changed files with 1017 additions and 1448 deletions

View File

@@ -1,17 +1,23 @@
# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
import asyncio
import io
import os
# Import shared libraries
import sys
from datetime import datetime
from typing import Any
from typing import Any, cast
import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
@@ -19,6 +25,7 @@ from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.ocr.processor import OCRProcessor
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
@@ -48,28 +55,31 @@ class OCRSettings(BaseAppSettings):
include_coordinates: bool = True
include_confidence: bool = True
# Vision/LLM OCR configuration
vision_provider: str = "ollama" # or "openai"
vision_model: str = "llama3.2-vision:11b"
vision_format: str = (
"text" # text | markdown | json | table | key_value | structured
)
vision_preprocess: bool = True
openai_base_url: str = "https://api.openai.com/v1/chat/completions"
# Create app and settings
app, settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
vision_processor: OCRProcessor | None = None
# Settings will be initialized after app creation
settings: OCRSettings
@app.on_event("startup")
async def startup_event() -> None:
async def init_dependencies(app_settings: OCRSettings) -> None:
"""Initialize service dependencies"""
global storage_client, document_storage, event_bus
global storage_client, document_storage, event_bus, settings, vision_processor
settings = app_settings
logger.info("Starting OCR service")
# Setup observability
@@ -79,42 +89,44 @@ async def startup_event() -> None:
minio_client = create_minio_client(settings)
storage_client = StorageClient(minio_client)
document_storage = DocumentStorage(storage_client)
# Initialize event bus
event_bus = create_event_bus(settings)
if not event_bus:
raise HTTPException(status_code=500, detail="Event bus not initialized")
await event_bus.start()
eb = event_bus
# mypy: event_bus is Optional, so use local alias after check
await eb.start()
# Subscribe to document ingestion events
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
# Initialize shared OCRProcessor for vision strategy
try:
vision_processor = OCRProcessor(
model_name=settings.vision_model,
provider=settings.vision_provider,
openai_base_url=settings.openai_base_url,
)
except Exception as e:
logger.error("Failed to initialize vision OCR processor", error=str(e))
logger.info("OCR service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None:
"""Cleanup service dependencies"""
global event_bus
# Create app and settings
app, _settings = create_app(
service_name="svc-ocr",
title="Tax Agent OCR Service",
description="OCR and layout extraction service",
settings_class=OCRSettings,
) # fmt: skip
logger.info("Shutting down OCR service")
# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
if event_bus:
await event_bus.stop()
logger.info("OCR service shutdown complete")
@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Return a liveness payload for the service.

    Returns:
        A dict with a fixed ``"healthy"`` status, the service name and
        version read from the module-level ``settings``, and the current
        naive-UTC time as an ISO-8601 string.
    """
    payload: dict[str, Any] = {"status": "healthy"}
    payload["service"] = settings.service_name
    payload["version"] = settings.service_version
    # utcnow() yields a naive datetime, so the string carries no UTC offset.
    payload["timestamp"] = datetime.utcnow().isoformat()
    return payload
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
@app.post("/process/{doc_id}")
@@ -132,9 +144,14 @@ async def process_document(
span.set_attribute("tenant_id", tenant_id)
span.set_attribute("strategy", strategy)
ds = document_storage
if ds is None:
raise HTTPException(
status_code=500, detail="Document storage not initialized"
)
try:
# Check if document exists
doc_content = await document_storage.get_document(tenant_id, doc_id)
doc_content = await ds.get_document(tenant_id, doc_id)
if not doc_content:
raise HTTPException(status_code=404, detail="Document not found")
@@ -142,9 +159,9 @@ async def process_document(
processing_id = str(ulid.new())
span.set_attribute("processing_id", processing_id)
# Start background processing
# Start background processing via sync wrapper (for mypy correctness)
background_tasks.add_task(
_process_document_async,
_schedule_process_document_async,
doc_id,
tenant_id,
doc_content,
@@ -168,7 +185,9 @@ async def process_document(
raise
except Exception as e:
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to start processing")
raise HTTPException(
status_code=500, detail="Failed to start processing"
) from e
@app.get("/results/{doc_id}")
@@ -183,9 +202,14 @@ async def get_ocr_results(
span.set_attribute("doc_id", doc_id)
span.set_attribute("tenant_id", tenant_id)
ds = document_storage
if ds is None:
raise HTTPException(
status_code=500, detail="Document storage not initialized"
)
try:
# Get OCR results from storage
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
ocr_results = await ds.get_ocr_result(tenant_id, doc_id)
if not ocr_results:
raise HTTPException(status_code=404, detail="OCR results not found")
@@ -196,26 +220,32 @@ async def get_ocr_results(
raise
except Exception as e:
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
raise HTTPException(status_code=500, detail="Failed to get OCR results")
raise HTTPException(
status_code=500, detail="Failed to get OCR results"
) from e
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
"""Handle document ingestion events"""
try:
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
data = payload.data
doc_id = data.get("doc_id")
tenant_id = data.get("tenant_id")
if not doc_id or not tenant_id:
logger.warning("Invalid document ingestion event", data=data)
return
if not doc_id or not tenant_id:
logger.warning("Invalid document ingestion event", data=data)
return
ds = document_storage
if ds is None:
logger.error("Document storage not initialized")
return
# Auto-process PDF documents
if data.get("content_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id)
# Auto-process PDF documents
if data.get("content_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id)
try:
# Get document content
doc_content = await document_storage.get_document(tenant_id, doc_id)
doc_content = await ds.get_document(tenant_id, doc_id)
if doc_content:
await _process_document_async(
doc_id=doc_id,
@@ -225,9 +255,10 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
processing_id=str(ulid.new()),
actor=payload.actor,
)
except Exception as e:
logger.error("Failed to handle document ingestion", error=str(e))
except Exception as e:
logger.error(
"Failed to handle document ingestion", doc_id=doc_id, error=str(e)
)
async def _process_document_async(
@@ -250,8 +281,8 @@ async def _process_document_async(
images = await _pdf_to_images(content)
# Process each page
pages_data: list[Any] = []
for page_num, image in enumerate(images, 1):
pages_data: list[dict[str, Any]] = []
for page_num, image in enumerate(images, 0):
page_data = await _process_page(image, page_num, strategy)
pages_data.append(page_data)
@@ -270,7 +301,10 @@ async def _process_document_async(
}
# Store results
await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)
ds = document_storage
if ds is None:
raise RuntimeError("Document storage not initialized")
await ds.store_ocr_result(tenant_id, doc_id, ocr_results)
# Update metrics
metrics.counter("documents_processed_total").labels(
@@ -282,7 +316,7 @@ async def _process_document_async(
).observe(
datetime.utcnow().timestamp()
- datetime.fromisoformat(
ocr_results["processed_at"].replace("Z", "")
ocr_results["processed_at"].replace("Z", "") # type: ignore
).timestamp()
)
@@ -300,7 +334,9 @@ async def _process_document_async(
tenant_id=tenant_id,
)
await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)
eb = event_bus
if eb is not None:
await eb.publish(EventTopics.DOC_OCR_READY, event_payload)
logger.info(
"OCR processing completed", doc_id=doc_id, pages=len(pages_data)
@@ -316,58 +352,91 @@ async def _process_document_async(
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
"""Convert PDF to images"""
"""Convert PDF to page images without PyMuPDF.
Primary: pdf2image (requires poppler). Fallback: extract largest embedded image per page via PyPDF2/Pillow.
"""
# First try pdf2image for full-page rasterization
try:
import fitz # PyMuPDF
# Open PDF
pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
images: list[Any] = []
for page_num in range(min(len(pdf_doc), settings.max_pages)):
page = pdf_doc[page_num]
# Render page to image
mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR
pix = page.get_pixmap(matrix=mat)
img_data = pix.tobytes("png")
images.append(img_data)
pdf_doc.close()
return images
except ImportError:
logger.error("PyMuPDF not available, using fallback")
return await _pdf_to_images_fallback(pdf_content)
except Exception as e:
logger.error("PDF conversion failed", error=str(e))
raise
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
"""Fallback PDF to images conversion"""
try:
from pdf2image import convert_from_bytes
images = convert_from_bytes(
pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
)
# Convert PIL images to bytes
image_bytes: list[Any] = []
image_bytes: list[bytes] = []
for img in images:
import io
img_buffer = io.BytesIO()
img.save(img_buffer, format="PNG")
image_bytes.append(img_buffer.getvalue())
return image_bytes
except Exception as e:
logger.warning(
"pdf2image conversion failed; falling back to PyPDF2", error=str(e)
)
except ImportError:
logger.error("pdf2image not available")
raise Exception("No PDF conversion library available")
# Fallback: extract largest embedded image per page using PyPDF2
try:
reader = PdfReader(io.BytesIO(pdf_content))
out_images: list[bytes] = []
for page_index, page in enumerate(reader.pages):
if page_index >= settings.max_pages:
break
try:
resources = page.get("/Resources")
if resources is None:
continue
xobject = resources.get("/XObject")
if xobject is None:
continue
xobject = xobject.get_object()
largest = None
largest_area = -1
for _, obj_ref in xobject.items():
try:
obj = obj_ref.get_object()
if obj.get("/Subtype") != "/Image":
continue
width = int(obj.get("/Width", 0))
height = int(obj.get("/Height", 0))
area = width * height
if area > largest_area:
largest = obj
largest_area = area
except Exception:
continue
if largest is None:
continue
data = largest.get_data()
filt = largest.get("/Filter")
if filt in ("/DCTDecode", "/JPXDecode"):
# JPEG or JPEG2000
out_images.append(data)
else:
# Flate or other; decode via Pillow
mode = "RGB"
colorspace = largest.get("/ColorSpace")
if colorspace in ("/DeviceGray",):
mode = "L"
width = int(largest.get("/Width", 0))
height = int(largest.get("/Height", 0))
try:
img = Image.frombytes(mode, (width, height), data)
except Exception:
img = Image.open(io.BytesIO(data))
buf = io.BytesIO()
img.save(buf, format="PNG")
out_images.append(buf.getvalue())
except Exception:
continue
if not out_images:
raise RuntimeError("No images extracted via PyPDF2 fallback")
return out_images
except Exception as fallback_e:
logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
raise
async def _process_page(
@@ -395,6 +464,8 @@ async def _process_page(
layoutlm_result.get("confidence", 0),
),
}
elif strategy == "vision":
return await _process_with_vision(image_data, page_num)
else:
raise ValueError(f"Unknown strategy: {strategy}")
@@ -402,11 +473,6 @@ async def _process_page(
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
"""Process page with Tesseract OCR"""
try:
import io
import pytesseract
from PIL import Image
# Load image
image = Image.open(io.BytesIO(image_data))
@@ -414,13 +480,13 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
config = f"{settings.tesseract_config} -l {settings.languages}"
# Extract text with confidence
data = pytesseract.image_to_data(
data = pytesseract.image_to_data( # type: ignore
image, config=config, output_type=pytesseract.Output.DICT
)
# Process results
words: list[Any] = []
confidences: list[Any] = []
words: list[dict[str, Any]] = []
confidences: list[float] = []
for i in range(len(data["text"])):
if int(data["conf"][i]) > 0: # Valid confidence
@@ -449,13 +515,6 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
"word_count": len(words),
}
except ImportError:
logger.error("pytesseract not available")
return {
"page": page_num,
"strategy": "tesseract",
"error": "pytesseract not available",
}
except Exception as e:
logger.error("Tesseract processing failed", page=page_num, error=str(e))
return {"page": page_num, "strategy": "tesseract", "error": str(e)}
@@ -482,6 +541,68 @@ async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str,
return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Run LLM vision OCR on one page image through the shared OCRProcessor.

    The page bytes are written to a temporary ``.png`` file because the
    processor is invoked with a file path; the file is removed once the call
    returns or fails.

    Args:
        image_data: Encoded image bytes for the page.
        page_num: Page number echoed back in the result.

    Returns:
        On success, a dict with ``page``, ``strategy``, ``text`` and a
        ``confidence`` of ``0.0``. On any failure (including an uninitialized
        processor), a dict with ``page``, ``strategy`` and ``error`` instead;
        the exception is logged, not raised.
    """
    try:
        processor = vision_processor
        if processor is None:
            raise RuntimeError("Vision OCR processor not initialized")

        import tempfile

        # delete=False: the file has to survive the ``with`` block so the
        # processor can open it by path; it is removed explicitly below.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as page_file:
            page_file.write(image_data)
            page_path = page_file.name

        try:
            raw_text = processor.process_image(
                image_path=page_path,
                format_type=settings.vision_format,
                preprocess=settings.vision_preprocess,
                language=settings.languages,
            )
        finally:
            # Best-effort cleanup: a failed delete must not mask the OCR outcome.
            try:
                os.remove(page_path)
            except OSError:
                pass

        if isinstance(raw_text, str):
            page_text = raw_text
        else:
            page_text = str(raw_text)

        return {
            "page": page_num,
            "strategy": "vision",
            "text": page_text,
            "confidence": 0.0,  # Not provided by LLM API
        }
    except Exception as exc:
        logger.error("Vision processing failed", page=page_num, error=str(exc))
        return {"page": page_num, "strategy": "vision", "error": str(exc)}
def _schedule_process_document_async(
doc_id: str,
tenant_id: str,
content: bytes,
strategy: str,
processing_id: str,
actor: str,
) -> None:
"""Sync wrapper to schedule the async OCR task.
This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict.
"""
asyncio.create_task(
_process_document_async(
doc_id=doc_id,
tenant_id=tenant_id,
content=content,
strategy=strategy,
processing_id=processing_id,
actor=actor,
)
)
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
"""Handle HTTP exceptions with RFC7807 format"""