# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI

import asyncio
import io
import os

# Import shared libraries
import sys
from datetime import datetime
from typing import Any, cast

import pytesseract
import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from pdf2image import convert_from_bytes
from PIL import Image
from PyPDF2 import PdfReader

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.ocr.processor import OCRProcessor
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class OCRSettings(BaseAppSettings):
    """Settings for OCR service"""

    service_name: str = "svc-ocr"

    # OCR configuration
    tesseract_cmd: str = "/usr/bin/tesseract"
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # Layout analysis
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    confidence_threshold: float = 0.7

    # Processing limits
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    include_coordinates: bool = True
    include_confidence: bool = True

    # Vision/LLM OCR configuration
    vision_provider: str = "ollama"  # or "openai"
    vision_model: str = "llama3.2-vision:11b"
    vision_format: str = (
        "text"  # text | markdown | json | table | key_value | structured
    )
    vision_preprocess: bool = True
    openai_base_url: str = "https://api.openai.com/v1/chat/completions"
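
# With the defaults above, Tesseract is invoked as "--oem 3 --psm 6 -l eng"
# (see _process_with_tesseract). Assuming BaseAppSettings follows pydantic
# BaseSettings conventions, these fields can presumably also be overridden via
# environment variables (e.g. VISION_PROVIDER=openai); verify against libs.config.
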
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None

vision_processor: OCRProcessor | None = None

# Settings will be initialized after app creation
settings: OCRSettings


async def init_dependencies(app_settings: OCRSettings) -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, event_bus, settings, vision_processor

    settings = app_settings
    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        raise HTTPException(status_code=500, detail="Event bus not initialized")

    eb = event_bus
    # mypy: event_bus is Optional, so use local alias after check
    await eb.start()

    # Subscribe to document ingestion events
    await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)

    # Initialize shared OCRProcessor for vision strategy
    try:
        vision_processor = OCRProcessor(
            model_name=settings.vision_model,
            provider=settings.vision_provider,
            openai_base_url=settings.openai_base_url,
        )
    except Exception as e:
        logger.error("Failed to initialize vision OCR processor", error=str(e))

    logger.info("OCR service started successfully")


# Create app and settings
app, _settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Initialize dependencies immediately
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))

tracer = get_tracer("svc-ocr")
metrics = get_metrics()

@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Process document with OCR"""

    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        ds = document_storage
        if ds is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )
        try:
            # Check if document exists
            doc_content = await ds.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Start background processing via sync wrapper (for mypy correctness)
            background_tasks.add_task(
                _schedule_process_document_async,
                doc_id,
                tenant_id,
                doc_content,
                strategy,
                processing_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )

            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to start processing"
            ) from e
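
# Illustrative request (header names depend on libs.security and are assumptions here):
#
#   curl -X POST "http://localhost:8002/process/<doc_id>?strategy=vision" \
#        -H "Authorization: Bearer <token>" -H "X-Tenant-ID: <tenant>"
#
# The endpoint returns immediately with a processing_id; poll GET /results/<doc_id>
# for the stored OCR output once background processing finishes.
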
@app.get("/results/{doc_id}")
async def get_ocr_results(
    doc_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Get OCR results for document"""

    with tracer.start_as_current_span("get_ocr_results") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        ds = document_storage
        if ds is None:
            raise HTTPException(
                status_code=500, detail="Document storage not initialized"
            )
        try:
            # Get OCR results from storage
            ocr_results = await ds.get_ocr_result(tenant_id, doc_id)

            if not ocr_results:
                raise HTTPException(status_code=404, detail="OCR results not found")

            return ocr_results

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to get OCR results"
            ) from e

async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """Handle document ingestion events"""
    data = payload.data
    doc_id = data.get("doc_id")
    tenant_id = data.get("tenant_id")

    if not doc_id or not tenant_id:
        logger.warning("Invalid document ingestion event", data=data)
        return

    ds = document_storage
    if ds is None:
        logger.error("Document storage not initialized")
        return

    # Auto-process PDF documents
    if data.get("content_type") == "application/pdf":
        logger.info("Auto-processing ingested document", doc_id=doc_id)

        try:
            # Get document content
            doc_content = await ds.get_document(tenant_id, doc_id)
            if doc_content:
                await _process_document_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    content=doc_content,
                    strategy="hybrid",
                    processing_id=str(ulid.new()),
                    actor=payload.actor,
                )
        except Exception as e:
            logger.error(
                "Failed to handle document ingestion", doc_id=doc_id, error=str(e)
            )
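
# NOTE: _handle_document_ingested only auto-runs OCR for "application/pdf" payloads;
# other content types have to be submitted explicitly via POST /process/{doc_id}.
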
async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously"""

    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        try:
            started_at = datetime.utcnow()

            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page
            pages_data: list[dict[str, Any]] = []
            for page_num, image in enumerate(images, 0):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            ds = document_storage
            if ds is None:
                raise RuntimeError("Document storage not initialized")
            await ds.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()

            # Observe processing time measured from the start of conversion
            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe(datetime.utcnow().timestamp() - started_at.timestamp())

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            eb = event_bus
            if eb is not None:
                await eb.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()

async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF to page images without PyMuPDF.

    Primary: pdf2image (requires poppler). Fallback: extract the largest embedded
    image per page via PyPDF2/Pillow.
    """
    # First try pdf2image for full-page rasterization
    try:
        images = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )
        image_bytes: list[bytes] = []
        for img in images:
            img_buffer = io.BytesIO()
            img.save(img_buffer, format="PNG")
            image_bytes.append(img_buffer.getvalue())
        return image_bytes
    except Exception as e:
        logger.warning(
            "pdf2image conversion failed; falling back to PyPDF2", error=str(e)
        )

    # Fallback: extract largest embedded image per page using PyPDF2
    try:
        reader = PdfReader(io.BytesIO(pdf_content))
        out_images: list[bytes] = []
        for page_index, page in enumerate(reader.pages):
            if page_index >= settings.max_pages:
                break
            try:
                resources = page.get("/Resources")
                if resources is None:
                    continue
                xobject = resources.get("/XObject")
                if xobject is None:
                    continue
                xobject = xobject.get_object()

                largest = None
                largest_area = -1
                for _, obj_ref in xobject.items():
                    try:
                        obj = obj_ref.get_object()
                        if obj.get("/Subtype") != "/Image":
                            continue
                        width = int(obj.get("/Width", 0))
                        height = int(obj.get("/Height", 0))
                        area = width * height
                        if area > largest_area:
                            largest = obj
                            largest_area = area
                    except Exception:
                        continue

                if largest is None:
                    continue

                data = largest.get_data()
                filt = largest.get("/Filter")

                if filt in ("/DCTDecode", "/JPXDecode"):
                    # JPEG or JPEG2000
                    out_images.append(data)
                else:
                    # Flate or other; decode via Pillow
                    mode = "RGB"
                    colorspace = largest.get("/ColorSpace")
                    if colorspace in ("/DeviceGray",):
                        mode = "L"
                    width = int(largest.get("/Width", 0))
                    height = int(largest.get("/Height", 0))
                    try:
                        img = Image.frombytes(mode, (width, height), data)
                    except Exception:
                        img = Image.open(io.BytesIO(data))
                    buf = io.BytesIO()
                    img.save(buf, format="PNG")
                    out_images.append(buf.getvalue())
            except Exception:
                continue

        if not out_images:
            raise RuntimeError("No images extracted via PyPDF2 fallback")
        return out_images
    except Exception as fallback_e:
        logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
        raise
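
# NOTE: the PyPDF2 fallback only recovers raster images already embedded in the PDF;
# pages with no image XObjects (pure text/vector content) are skipped entirely, so
# born-digital PDFs generally require the pdf2image/poppler path above.
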
async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Process single page with OCR"""

    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)
    elif strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)
    elif strategy == "hybrid":
        # Combine both approaches
        tesseract_result = await _process_with_tesseract(image_data, page_num)
        layoutlm_result = await _process_with_layoutlm(image_data, page_num)

        return {
            "page": page_num,
            "strategy": "hybrid",
            "tesseract": tesseract_result,
            "layoutlm": layoutlm_result,
            "text": tesseract_result.get("text", ""),
            "confidence": max(
                tesseract_result.get("confidence", 0),
                layoutlm_result.get("confidence", 0),
            ),
        }
    elif strategy == "vision":
        return await _process_with_vision(image_data, page_num)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")
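
# In hybrid mode the page text comes from Tesseract (the LayoutLM path is currently a
# stub returning empty text), while the reported confidence is the higher of the two.
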
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with Tesseract OCR"""
    try:
        # Load image
        image = Image.open(io.BytesIO(image_data))

        # Configure Tesseract
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Extract text with confidence
        data = pytesseract.image_to_data(  # type: ignore
            image, config=config, output_type=pytesseract.Output.DICT
        )

        # Process results
        words: list[dict[str, Any]] = []
        confidences: list[float] = []

        for i in range(len(data["text"])):
            if int(data["conf"][i]) > 0:  # Valid confidence
                word_data = {
                    "text": data["text"][i],
                    "confidence": int(data["conf"][i]) / 100.0,
                    "bbox": [
                        data["left"][i],
                        data["top"][i],
                        data["left"][i] + data["width"][i],
                        data["top"][i] + data["height"][i],
                    ],
                }
                words.append(word_data)
                confidences.append(word_data["confidence"])

        # Extract full text
        full_text = pytesseract.image_to_string(image, config=config)

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
            "word_count": len(words),
        }

    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}
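
# Word bounding boxes above are [x0, y0, x1, y1] pixel coordinates in the rendered page
# image (origin at the top-left), built from Tesseract's left/top/width/height fields.
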
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LayoutLM"""
    try:
        # This would integrate with LayoutLM model
        # For now, return placeholder
        logger.warning("LayoutLM processing not implemented")

        return {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}

async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LLM vision OCR via shared OCRProcessor"""
    try:
        vp = vision_processor
        if vp is None:
            raise RuntimeError("Vision OCR processor not initialized")

        # Persist the page image temporarily for the processor API
        import tempfile

        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp.write(image_data)
            tmp_path = tmp.name

        try:
            text = vp.process_image(
                image_path=tmp_path,
                format_type=settings.vision_format,
                preprocess=settings.vision_preprocess,
                language=settings.languages,
            )
        finally:
            try:
                os.remove(tmp_path)
            except OSError:
                pass

        return {
            "page": page_num,
            "strategy": "vision",
            "text": text if isinstance(text, str) else str(text),
            "confidence": 0.0,  # Not provided by LLM API
        }
    except Exception as e:
        logger.error("Vision processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "vision", "error": str(e)}

def _schedule_process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Sync wrapper to schedule the async OCR task.

    This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict.
    """
    asyncio.create_task(
        _process_document_async(
            doc_id=doc_id,
            tenant_id=tenant_id,
            content=content,
            strategy=strategy,
            processing_id=processing_id,
            actor=actor,
        )
    )
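
# NOTE: asyncio.create_task() needs a running event loop in the calling thread. This
# wrapper assumes BackgroundTasks invokes it on the loop thread; if sync callables are
# run in a worker thread instead, scheduling would have to go through something like
# asyncio.run_coroutine_threadsafe() against the main loop.
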
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )

if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)