Files
ai-tax-agent/apps/svc_ocr/main.py
harkon fdba81809f
Some checks failed
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
completed local setup with compose
2025-11-26 13:17:17 +00:00

666 lines
22 KiB
Python

# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
import asyncio
import io
import os
# Import shared libraries
import sys
from contextlib import asynccontextmanager
from datetime import datetime
from typing import Any, cast
import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.ocr.processor import OCRProcessor
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient
logger = structlog.get_logger()
class OCRSettings(BaseAppSettings):
    """Configuration for the OCR service.

    Extends ``BaseAppSettings`` with the OCR-specific knobs below. Several
    fields are declared here but are not read anywhere in the code visible in
    this module; those are marked with a review note.
    """

    service_name: str = "svc-ocr"
    # OCR configuration
    # NOTE(review): tesseract_cmd is not applied to pytesseract in the visible code - confirm.
    tesseract_cmd: str = "/usr/bin/tesseract"
    # Base Tesseract CLI flags; "-l <languages>" is appended when a page is processed.
    tesseract_config: str = "--oem 3 --psm 6"
    # Language code(s) passed to Tesseract via "-l" and to the vision processor.
    languages: str = "eng"
    # Layout analysis
    # NOTE(review): layoutlm_model is not referenced in the visible code (LayoutLM is a placeholder).
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    # Only echoed into the stored result metadata in the visible code.
    confidence_threshold: float = 0.7
    # Processing limits
    # Upper bound on the number of PDF pages converted to images.
    max_pages: int = 50
    # NOTE(review): max_file_size is not enforced in the visible code - confirm.
    max_file_size: int = 100 * 1024 * 1024  # 100MB
    # Output configuration
    # NOTE(review): neither flag below is consulted in the visible code - confirm.
    include_coordinates: bool = True
    include_confidence: bool = True
    # Vision/LLM OCR configuration
    vision_provider: str = "ollama"  # or "openai"
    vision_model: str = "llama3.2-vision:11b"
    vision_format: str = (
        "text"  # text | markdown | json | table | key_value | structured
    )
    # Forwarded as the ``preprocess`` argument of the vision processor.
    vision_preprocess: bool = True
    openai_base_url: str = "https://api.openai.com/v1/chat/completions"
# Global clients
# All of these start as None and are assigned by init_dependencies(); code that
# uses them checks for None first.
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
vision_processor: OCRProcessor | None = None
# Settings will be initialized after app creation
# This is a bare annotation: the name is not bound until init_dependencies()
# assigns it, so reading it earlier raises NameError.
settings: OCRSettings
async def init_dependencies(app_settings: OCRSettings) -> None:
    """Initialize the module-level service dependencies.

    Sets the ``settings``, ``storage_client``, ``document_storage``,
    ``event_bus`` and ``vision_processor`` globals. The event bus connection is
    retried with exponential backoff; the wait between attempts is capped so a
    long outage cannot stretch a single sleep to hours or days.

    Args:
        app_settings: Settings instance to install as the module-level
            ``settings``.

    Raises:
        HTTPException: If no event bus could be created, or if connecting and
            subscribing still fails after the final retry.
    """
    global storage_client, document_storage, event_bus, settings, vision_processor

    # Larger delay to ensure NATS is fully ready before attempting connection
    await asyncio.sleep(10)

    settings = app_settings
    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus with retry logic
    max_retries = 20
    delay = 5
    # Without a ceiling, doubling over 20 attempts reaches 5 * 2**19 seconds.
    max_delay = 60
    for attempt in range(1, max_retries + 1):
        logger.info(
            "Attempting NATS connection", url=settings.nats_servers, attempt=attempt
        )
        event_bus = create_event_bus(settings)
        if not event_bus:
            raise HTTPException(status_code=500, detail="Event bus not initialized")
        eb = event_bus
        try:
            # Attempt to start and subscribe
            await eb.start()
            await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
            logger.info("NATS connection established on attempt", attempt=attempt)
            break
        except Exception as e:
            logger.error(
                "Failed to connect to NATS, retrying",
                attempt=attempt,
                error=str(e),
            )
            if attempt == max_retries:
                raise HTTPException(
                    status_code=500, detail="Failed to connect to NATS after retries"
                ) from e
            await asyncio.sleep(delay)
            delay = min(delay * 2, max_delay)  # capped exponential backoff

    # Initialize shared OCRProcessor for vision strategy.
    # A failure here is logged but not fatal: vision_processor keeps its
    # previous value and the "vision" strategy reports an error per page.
    try:
        vision_processor = OCRProcessor(
            model_name=settings.vision_model,
            provider=settings.vision_provider,
            openai_base_url=settings.openai_base_url,
        )
    except Exception as e:
        logger.error("Failed to initialize vision OCR processor", error=str(e))

    logger.info("OCR service started successfully")
async def shutdown_dependencies() -> None:
    """Stop the event bus, if one was created, and log the shutdown."""
    logger.info("Shutting down OCR service")
    # Snapshot the global so the None check and the call use the same object.
    if (bus := event_bus) is not None:
        await bus.stop()
    logger.info("OCR service shutdown complete")
@asynccontextmanager
async def lifespan(app: FastAPI):  # type: ignore
    """FastAPI lifespan handler: start dependencies, then always shut them down.

    ``_settings`` is the settings object returned by ``create_app`` further
    down in this module; it is resolved when the lifespan runs, not when this
    function is defined.
    """
    # Startup
    await init_dependencies(cast(OCRSettings, _settings))
    try:
        yield
    finally:
        # Shutdown - in ``finally`` so the event bus is stopped even when the
        # context is left through an exception or cancellation.
        await shutdown_dependencies()
# Create app and settings with lifespan
# create_app returns the FastAPI app together with a settings object; the
# latter is what lifespan() hands to init_dependencies().
app, _settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip
# Override app's lifespan
# Assigned on the router after construction because the app comes from the
# shared factory.
app.router.lifespan_context = lifespan
# Module-level tracer and metrics handles used by the handlers below.
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Queue OCR processing for a stored document and return immediately.

    Args:
        doc_id: Identifier of the document to process.
        background_tasks: FastAPI background task collector for this request.
        strategy: OCR strategy name forwarded to the processing pipeline.
        current_user: Authenticated user claims; ``sub`` is used as the actor,
            falling back to ``"system"``.
        tenant_id: Tenant that owns the document.

    Returns:
        A dict with ``processing_id``, ``doc_id``, ``status`` (always
        ``"processing"``) and ``strategy``.

    Raises:
        HTTPException: 500 if document storage is not initialized or the job
            could not be queued; 404 if the document does not exist.
    """
    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        ds = document_storage
        if ds is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )

        try:
            # Check if document exists
            doc_content = await ds.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Schedule the coroutine function itself. Background tasks await
            # async callables on the server's event loop, whereas a sync
            # callable is executed in a worker thread where there is no running
            # loop for asyncio.create_task() to attach to.
            background_tasks.add_task(
                _process_document_async,
                doc_id=doc_id,
                tenant_id=tenant_id,
                content=doc_content,
                strategy=strategy,
                processing_id=processing_id,
                actor=current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )
            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }
        except HTTPException:
            # Preserve deliberate HTTP errors (e.g. the 404 above).
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to start processing"
            ) from e
@app.get("/results/{doc_id}")
async def get_ocr_results(
    doc_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Return the stored OCR result for a document.

    Raises:
        HTTPException: 500 if document storage is not initialized or the
            lookup fails; 404 if no result is stored for the document.
    """
    with tracer.start_as_current_span("get_ocr_results") as span:
        for attr_name, attr_value in (("doc_id", doc_id), ("tenant_id", tenant_id)):
            span.set_attribute(attr_name, attr_value)

        storage = document_storage
        if storage is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )

        try:
            # Fetch whatever the storage layer holds for this tenant/document.
            stored = await storage.get_ocr_result(tenant_id, doc_id)
        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to get OCR results"
            ) from e

        if not stored:
            raise HTTPException(status_code=404, detail="OCR results not found")
        return stored
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """React to a document-ingested event by OCR-ing PDFs with the hybrid strategy.

    Events missing ``doc_id`` or ``tenant_id`` are logged and dropped, as are
    events received before document storage is initialized. Only events whose
    ``content_type`` is ``application/pdf`` trigger processing. Errors are
    logged and not re-raised.
    """
    event = payload.data
    doc_id, tenant_id = event.get("doc_id"), event.get("tenant_id")
    if not (doc_id and tenant_id):
        logger.warning("Invalid document ingestion event", data=event)
        return

    storage = document_storage
    if storage is None:
        logger.error("Document storage not initialized")
        return

    # Auto-process PDF documents only
    if event.get("content_type") != "application/pdf":
        return

    logger.info("Auto-processing ingested document", doc_id=doc_id)
    try:
        pdf_bytes = await storage.get_document(tenant_id, doc_id)
        if not pdf_bytes:
            # Nothing stored for this document: skip silently.
            return
        await _process_document_async(
            doc_id=doc_id,
            tenant_id=tenant_id,
            content=pdf_bytes,
            strategy="hybrid",
            processing_id=str(ulid.new()),
            actor=payload.actor,
        )
    except Exception as e:
        logger.error(
            "Failed to handle document ingestion", doc_id=doc_id, error=str(e)
        )
async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Run the full OCR pipeline for one document.

    Converts the PDF to page images, processes each page with ``strategy``,
    stores the combined result, records metrics and publishes a
    ``DOC_OCR_READY`` event when an event bus is available.

    Failures are logged and counted in ``processing_errors_total``; they are
    not re-raised, so callers never see an exception from this coroutine.

    Args:
        doc_id: Identifier of the document being processed.
        tenant_id: Tenant that owns the document.
        content: Raw PDF bytes.
        strategy: OCR strategy name passed to the per-page processor.
        processing_id: Identifier of this processing run.
        actor: Actor recorded on the published event.
    """
    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        # Monotonic clock for the duration metric, started before any work so
        # the histogram covers conversion, OCR and storage.
        clock = asyncio.get_running_loop().time
        started = clock()

        try:
            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page (page numbers are zero-based)
            pages_data: list[dict[str, Any]] = []
            for page_num, image in enumerate(images):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            ds = document_storage
            if ds is None:
                raise RuntimeError("Document storage not initialized")
            await ds.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()
            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe(clock() - started)

            # Publish completion event
            # NOTE(review): the whole result is embedded in the event payload;
            # confirm this stays within the bus's message size limit.
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )
            eb = event_bus
            if eb is not None:
                await eb.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )
        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))
            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()
def _largest_image_xobject(page: Any) -> Any | None:
    """Return the image XObject with the largest pixel area on ``page``, or None."""
    resources = page.get("/Resources")
    if resources is None:
        return None
    xobjects = resources.get("/XObject")
    if xobjects is None:
        return None
    xobjects = xobjects.get_object()

    best = None
    best_area = -1
    for _, ref in xobjects.items():
        try:
            candidate = ref.get_object()
            if candidate.get("/Subtype") != "/Image":
                continue
            area = int(candidate.get("/Width", 0)) * int(candidate.get("/Height", 0))
        except Exception:
            # An unreadable XObject is skipped; the others are still considered.
            continue
        if area > best_area:
            best, best_area = candidate, area
    return best


def _embedded_image_to_bytes(image_obj: Any) -> bytes:
    """Return ``image_obj`` as encoded image bytes (raw JPEG/JPEG2000, else PNG)."""
    raw = image_obj.get_data()
    if image_obj.get("/Filter") in ("/DCTDecode", "/JPXDecode"):
        # JPEG or JPEG2000: the stream is passed through unchanged.
        return raw

    # Flate or other; decode via Pillow
    mode = "L" if image_obj.get("/ColorSpace") in ("/DeviceGray",) else "RGB"
    size = (int(image_obj.get("/Width", 0)), int(image_obj.get("/Height", 0)))
    try:
        decoded = Image.frombytes(mode, size, raw)
    except Exception:
        # Not raw pixel data of the expected size; let Pillow sniff the format.
        decoded = Image.open(io.BytesIO(raw))
    png = io.BytesIO()
    decoded.save(png, format="PNG")
    return png.getvalue()


async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF to page images without PyMuPDF.

    Primary: pdf2image (requires poppler), rendering at 200 dpi up to
    ``settings.max_pages`` pages as PNG. Fallback: extract the largest embedded
    image per page via PyPDF2/Pillow; pages without a usable image are skipped.

    Raises:
        Exception: Whatever the fallback raised, including ``RuntimeError``
            when it produced no images at all.
    """
    # First try pdf2image for full-page rasterization
    try:
        rendered = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )
        pngs: list[bytes] = []
        for page_image in rendered:
            sink = io.BytesIO()
            page_image.save(sink, format="PNG")
            pngs.append(sink.getvalue())
        return pngs
    except Exception as e:
        logger.warning(
            "pdf2image conversion failed; falling back to PyPDF2", error=str(e)
        )

    # Fallback: extract largest embedded image per page using PyPDF2
    try:
        reader = PdfReader(io.BytesIO(pdf_content))
        extracted: list[bytes] = []
        for page_index, page in enumerate(reader.pages):
            if page_index >= settings.max_pages:
                break
            try:
                picked = _largest_image_xobject(page)
                if picked is not None:
                    extracted.append(_embedded_image_to_bytes(picked))
            except Exception:
                # Best effort: a page that cannot be decoded is skipped.
                continue
        if not extracted:
            raise RuntimeError("No images extracted via PyPDF2 fallback")
        return extracted
    except Exception as fallback_e:
        logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
        raise
async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Run the OCR engine(s) selected by ``strategy`` on one page image.

    Raises:
        ValueError: If ``strategy`` is not one of ``tesseract``, ``layoutlm``,
            ``hybrid`` or ``vision``.
    """
    # Single-engine strategies delegate directly.
    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)
    if strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)
    if strategy == "vision":
        return await _process_with_vision(image_data, page_num)
    if strategy != "hybrid":
        raise ValueError(f"Unknown strategy: {strategy}")

    # Hybrid: run both engines; text comes from Tesseract, confidence is the
    # higher of the two.
    ocr_page = await _process_with_tesseract(image_data, page_num)
    layout_page = await _process_with_layoutlm(image_data, page_num)
    best_confidence = max(
        ocr_page.get("confidence", 0),
        layout_page.get("confidence", 0),
    )
    return {
        "page": page_num,
        "strategy": "hybrid",
        "tesseract": ocr_page,
        "layoutlm": layout_page,
        "text": ocr_page.get("text", ""),
        "confidence": best_confidence,
    }
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process a page image with Tesseract OCR.

    Args:
        image_data: Encoded image bytes for the page.
        page_num: Page number echoed back in the result.

    Returns:
        On success, a dict with ``page``, ``strategy``, ``text``, ``words``
        (each with ``text``, ``confidence`` in 0-1 and ``bbox`` as
        ``[left, top, right, bottom]``), the mean word ``confidence`` and
        ``word_count``. On failure, a dict with ``page``, ``strategy`` and
        ``error``; the exception is logged rather than raised.
    """
    try:
        # Configure Tesseract
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Context manager so the decoded image is released when OCR is done.
        with Image.open(io.BytesIO(image_data)) as image:
            # Extract per-word data with confidence
            data = pytesseract.image_to_data(
                image, config=config, output_type=pytesseract.Output.DICT
            )
            # Extract full text
            full_text = pytesseract.image_to_string(image, config=config)

        # Process results
        words: list[dict[str, Any]] = []
        columns = zip(
            data["text"],
            data["conf"],
            data["left"],
            data["top"],
            data["width"],
            data["height"],
        )
        for text, conf, left, top, width, height in columns:
            # float() accepts ints, "-1" and fractional strings such as
            # "96.5"; int() raises ValueError on the latter.
            score = float(conf)
            if score <= 0:  # no valid confidence for this row
                continue
            words.append(
                {
                    "text": text,
                    "confidence": score / 100.0,
                    "bbox": [left, top, left + width, top + height],
                }
            )

        mean_confidence = (
            sum(word["confidence"] for word in words) / len(words) if words else 0.0
        )
        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": mean_confidence,
            "word_count": len(words),
        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Placeholder LayoutLM processor.

    No model is invoked: a warning is logged and an empty result carrying
    ``"error": "Not implemented"`` is returned for the page.
    """
    try:
        # This would integrate with LayoutLM model; for now emit a stub result.
        logger.warning("LayoutLM processing not implemented")
        stub: dict[str, Any] = {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }
        return stub
    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process a page with LLM vision OCR via the shared OCRProcessor.

    The page image is written to a temporary ``.png`` file because the
    processor is called with a file path; the file is removed afterwards.
    Any failure, including an uninitialized processor, is logged and returned
    as an ``error`` entry instead of being raised.
    """
    try:
        processor = vision_processor
        if processor is None:
            raise RuntimeError("Vision OCR processor not initialized")

        # Persist the page image temporarily for the processor API
        import tempfile

        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
            handle.write(image_data)
            page_file = handle.name

        try:
            recognized = processor.process_image(
                image_path=page_file,
                format_type=settings.vision_format,
                preprocess=settings.vision_preprocess,
                language=settings.languages,
            )
        finally:
            # Best-effort cleanup; a file that is already gone is not an error.
            try:
                os.remove(page_file)
            except OSError:
                pass

        if not isinstance(recognized, str):
            recognized = str(recognized)
        return {
            "page": page_num,
            "strategy": "vision",
            "text": recognized,
            "confidence": 0.0,  # Not provided by LLM API
        }
    except Exception as e:
        logger.error("Vision processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "vision", "error": str(e)}
# Strong references to in-flight OCR tasks. The event loop only keeps weak
# references to tasks, so one that nothing else references can be
# garbage-collected before it finishes.
_background_ocr_tasks: set["asyncio.Task[None]"] = set()


def _schedule_process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Sync wrapper that starts the async OCR pipeline.

    When called on a thread with a running event loop, the pipeline is
    scheduled as a task on that loop and this function returns immediately.
    When called without one (a sync callable run in a worker thread), the
    pipeline is run to completion on a private loop instead of failing with
    "no running event loop".

    Args:
        doc_id: Identifier of the document to process.
        tenant_id: Tenant that owns the document.
        content: Raw PDF bytes.
        strategy: OCR strategy name.
        processing_id: Identifier of this processing run.
        actor: Actor recorded on the completion event.
    """
    pipeline = _process_document_async(
        doc_id=doc_id,
        tenant_id=tenant_id,
        content=content,
        strategy=strategy,
        processing_id=processing_id,
        actor=actor,
    )
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # NOTE(review): clients created on the main loop (e.g. the event bus)
        # may not be usable from this private loop - confirm; the pipeline
        # logs and swallows such failures.
        asyncio.run(pipeline)
        return

    task = loop.create_task(pipeline)
    _background_ocr_tasks.add(task)
    # Drop the reference once the task is done so the set does not grow.
    task.add_done_callback(_background_ocr_tasks.discard)
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Render an HTTPException as an RFC7807-style problem document.

    The exception's ``detail`` is used for both ``title`` and ``detail``, the
    request URL for ``instance``, and ``trace_id`` is left empty.
    """
    status = exc.status_code
    problem = ErrorResponse(
        type=f"https://httpstatuses.com/{status}",
        title=exc.detail,
        status=status,
        detail=exc.detail,
        instance=str(request.url),
        trace_id="",
    )
    return JSONResponse(status_code=status, content=problem.model_dump())
if __name__ == "__main__":
    # Development entry point: serves this module's ``app`` on all interfaces,
    # port 8002, with auto-reload on and uvicorn's own log config disabled.
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)