deployment, linting and infra configuration
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
@@ -1,17 +1,23 @@
|
||||
# FILE: apps/svc-ocr/main.py
|
||||
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from typing import Any, cast
|
||||
|
||||
import pytesseract
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from pdf2image import convert_from_bytes
|
||||
from PIL import Image
|
||||
from PyPDF2 import PdfReader
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
@@ -19,6 +25,7 @@ from libs.app_factory import create_app
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.ocr.processor import OCRProcessor
|
||||
from libs.schemas import ErrorResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
from libs.storage import DocumentStorage, StorageClient
|
||||
@@ -48,28 +55,31 @@ class OCRSettings(BaseAppSettings):
|
||||
include_coordinates: bool = True
|
||||
include_confidence: bool = True
|
||||
|
||||
# Vision/LLM OCR configuration
|
||||
vision_provider: str = "ollama" # or "openai"
|
||||
vision_model: str = "llama3.2-vision:11b"
|
||||
vision_format: str = (
|
||||
"text" # text | markdown | json | table | key_value | structured
|
||||
)
|
||||
vision_preprocess: bool = True
|
||||
openai_base_url: str = "https://api.openai.com/v1/chat/completions"
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-ocr",
|
||||
title="Tax Agent OCR Service",
|
||||
description="OCR and layout extraction service",
|
||||
settings_class=OCRSettings,
|
||||
) # fmt: skip
|
||||
|
||||
# Global clients
|
||||
storage_client: StorageClient | None = None
|
||||
document_storage: DocumentStorage | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-ocr")
|
||||
metrics = get_metrics()
|
||||
|
||||
vision_processor: OCRProcessor | None = None
|
||||
# Settings will be initialized after app creation
|
||||
settings: OCRSettings
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
async def init_dependencies(app_settings: OCRSettings) -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global storage_client, document_storage, event_bus
|
||||
global storage_client, document_storage, event_bus, settings, vision_processor
|
||||
|
||||
settings = app_settings
|
||||
logger.info("Starting OCR service")
|
||||
|
||||
# Setup observability
|
||||
@@ -79,42 +89,44 @@ async def startup_event() -> None:
|
||||
minio_client = create_minio_client(settings)
|
||||
storage_client = StorageClient(minio_client)
|
||||
document_storage = DocumentStorage(storage_client)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
if not event_bus:
|
||||
raise HTTPException(status_code=500, detail="Event bus not initialized")
|
||||
|
||||
await event_bus.start()
|
||||
eb = event_bus
|
||||
# mypy: event_bus is Optional, so use local alias after check
|
||||
await eb.start()
|
||||
|
||||
# Subscribe to document ingestion events
|
||||
await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
|
||||
await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)
|
||||
|
||||
# Initialize shared OCRProcessor for vision strategy
|
||||
try:
|
||||
vision_processor = OCRProcessor(
|
||||
model_name=settings.vision_model,
|
||||
provider=settings.vision_provider,
|
||||
openai_base_url=settings.openai_base_url,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("Failed to initialize vision OCR processor", error=str(e))
|
||||
|
||||
logger.info("OCR service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global event_bus
|
||||
# Create app and settings
|
||||
app, _settings = create_app(
|
||||
service_name="svc-ocr",
|
||||
title="Tax Agent OCR Service",
|
||||
description="OCR and layout extraction service",
|
||||
settings_class=OCRSettings,
|
||||
) # fmt: skip
|
||||
|
||||
logger.info("Shutting down OCR service")
|
||||
# Initialize dependencies immediately
|
||||
asyncio.run(init_dependencies(cast(OCRSettings, _settings)))
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("OCR service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
tracer = get_tracer("svc-ocr")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.post("/process/{doc_id}")
|
||||
@@ -132,9 +144,14 @@ async def process_document(
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("strategy", strategy)
|
||||
|
||||
ds = document_storage
|
||||
if ds is None:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Document storage not initialized"
|
||||
)
|
||||
try:
|
||||
# Check if document exists
|
||||
doc_content = await document_storage.get_document(tenant_id, doc_id)
|
||||
doc_content = await ds.get_document(tenant_id, doc_id)
|
||||
if not doc_content:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
@@ -142,9 +159,9 @@ async def process_document(
|
||||
processing_id = str(ulid.new())
|
||||
span.set_attribute("processing_id", processing_id)
|
||||
|
||||
# Start background processing
|
||||
# Start background processing via sync wrapper (for mypy correctness)
|
||||
background_tasks.add_task(
|
||||
_process_document_async,
|
||||
_schedule_process_document_async,
|
||||
doc_id,
|
||||
tenant_id,
|
||||
doc_content,
|
||||
@@ -168,7 +185,9 @@ async def process_document(
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to start processing")
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to start processing"
|
||||
) from e
|
||||
|
||||
|
||||
@app.get("/results/{doc_id}")
|
||||
@@ -183,9 +202,14 @@ async def get_ocr_results(
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
ds = document_storage
|
||||
if ds is None:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Document storage not initialized"
|
||||
)
|
||||
try:
|
||||
# Get OCR results from storage
|
||||
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
|
||||
ocr_results = await ds.get_ocr_result(tenant_id, doc_id)
|
||||
|
||||
if not ocr_results:
|
||||
raise HTTPException(status_code=404, detail="OCR results not found")
|
||||
@@ -196,26 +220,32 @@ async def get_ocr_results(
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to get OCR results")
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to get OCR results"
|
||||
) from e
|
||||
|
||||
|
||||
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle document ingestion events"""
|
||||
try:
|
||||
data = payload.data
|
||||
doc_id = data.get("doc_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
data = payload.data
|
||||
doc_id = data.get("doc_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
|
||||
if not doc_id or not tenant_id:
|
||||
logger.warning("Invalid document ingestion event", data=data)
|
||||
return
|
||||
if not doc_id or not tenant_id:
|
||||
logger.warning("Invalid document ingestion event", data=data)
|
||||
return
|
||||
ds = document_storage
|
||||
if ds is None:
|
||||
logger.error("Document storage not initialized")
|
||||
return
|
||||
|
||||
# Auto-process PDF documents
|
||||
if data.get("content_type") == "application/pdf":
|
||||
logger.info("Auto-processing ingested document", doc_id=doc_id)
|
||||
# Auto-process PDF documents
|
||||
if data.get("content_type") == "application/pdf":
|
||||
logger.info("Auto-processing ingested document", doc_id=doc_id)
|
||||
|
||||
try:
|
||||
# Get document content
|
||||
doc_content = await document_storage.get_document(tenant_id, doc_id)
|
||||
doc_content = await ds.get_document(tenant_id, doc_id)
|
||||
if doc_content:
|
||||
await _process_document_async(
|
||||
doc_id=doc_id,
|
||||
@@ -225,9 +255,10 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
|
||||
processing_id=str(ulid.new()),
|
||||
actor=payload.actor,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to handle document ingestion", error=str(e))
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to handle document ingestion", doc_id=doc_id, error=str(e)
|
||||
)
|
||||
|
||||
|
||||
async def _process_document_async(
|
||||
@@ -250,8 +281,8 @@ async def _process_document_async(
|
||||
images = await _pdf_to_images(content)
|
||||
|
||||
# Process each page
|
||||
pages_data: list[Any] = []
|
||||
for page_num, image in enumerate(images, 1):
|
||||
pages_data: list[dict[str, Any]] = []
|
||||
for page_num, image in enumerate(images, 0):
|
||||
page_data = await _process_page(image, page_num, strategy)
|
||||
pages_data.append(page_data)
|
||||
|
||||
@@ -270,7 +301,10 @@ async def _process_document_async(
|
||||
}
|
||||
|
||||
# Store results
|
||||
await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)
|
||||
ds = document_storage
|
||||
if ds is None:
|
||||
raise RuntimeError("Document storage not initialized")
|
||||
await ds.store_ocr_result(tenant_id, doc_id, ocr_results)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("documents_processed_total").labels(
|
||||
@@ -282,7 +316,7 @@ async def _process_document_async(
|
||||
).observe(
|
||||
datetime.utcnow().timestamp()
|
||||
- datetime.fromisoformat(
|
||||
ocr_results["processed_at"].replace("Z", "")
|
||||
ocr_results["processed_at"].replace("Z", "") # type: ignore
|
||||
).timestamp()
|
||||
)
|
||||
|
||||
@@ -300,7 +334,9 @@ async def _process_document_async(
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)
|
||||
eb = event_bus
|
||||
if eb is not None:
|
||||
await eb.publish(EventTopics.DOC_OCR_READY, event_payload)
|
||||
|
||||
logger.info(
|
||||
"OCR processing completed", doc_id=doc_id, pages=len(pages_data)
|
||||
@@ -316,58 +352,91 @@ async def _process_document_async(
|
||||
|
||||
|
||||
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
|
||||
"""Convert PDF to images"""
|
||||
"""Convert PDF to page images without PyMuPDF.
|
||||
|
||||
Primary: pdf2image (requires poppler). Fallback: extract largest embedded image per page via PyPDF2/Pillow.
|
||||
"""
|
||||
# First try pdf2image for full-page rasterization
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
|
||||
# Open PDF
|
||||
pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
|
||||
|
||||
images: list[Any] = []
|
||||
for page_num in range(min(len(pdf_doc), settings.max_pages)):
|
||||
page = pdf_doc[page_num]
|
||||
|
||||
# Render page to image
|
||||
mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
img_data = pix.tobytes("png")
|
||||
|
||||
images.append(img_data)
|
||||
|
||||
pdf_doc.close()
|
||||
return images
|
||||
|
||||
except ImportError:
|
||||
logger.error("PyMuPDF not available, using fallback")
|
||||
return await _pdf_to_images_fallback(pdf_content)
|
||||
except Exception as e:
|
||||
logger.error("PDF conversion failed", error=str(e))
|
||||
raise
|
||||
|
||||
|
||||
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
|
||||
"""Fallback PDF to images conversion"""
|
||||
try:
|
||||
from pdf2image import convert_from_bytes
|
||||
|
||||
images = convert_from_bytes(
|
||||
pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
|
||||
)
|
||||
|
||||
# Convert PIL images to bytes
|
||||
image_bytes: list[Any] = []
|
||||
image_bytes: list[bytes] = []
|
||||
for img in images:
|
||||
import io
|
||||
|
||||
img_buffer = io.BytesIO()
|
||||
img.save(img_buffer, format="PNG")
|
||||
image_bytes.append(img_buffer.getvalue())
|
||||
|
||||
return image_bytes
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
"pdf2image conversion failed; falling back to PyPDF2", error=str(e)
|
||||
)
|
||||
|
||||
except ImportError:
|
||||
logger.error("pdf2image not available")
|
||||
raise Exception("No PDF conversion library available")
|
||||
# Fallback: extract largest embedded image per page using PyPDF2
|
||||
try:
|
||||
reader = PdfReader(io.BytesIO(pdf_content))
|
||||
out_images: list[bytes] = []
|
||||
for page_index, page in enumerate(reader.pages):
|
||||
if page_index >= settings.max_pages:
|
||||
break
|
||||
try:
|
||||
resources = page.get("/Resources")
|
||||
if resources is None:
|
||||
continue
|
||||
xobject = resources.get("/XObject")
|
||||
if xobject is None:
|
||||
continue
|
||||
xobject = xobject.get_object()
|
||||
|
||||
largest = None
|
||||
largest_area = -1
|
||||
for _, obj_ref in xobject.items():
|
||||
try:
|
||||
obj = obj_ref.get_object()
|
||||
if obj.get("/Subtype") != "/Image":
|
||||
continue
|
||||
width = int(obj.get("/Width", 0))
|
||||
height = int(obj.get("/Height", 0))
|
||||
area = width * height
|
||||
if area > largest_area:
|
||||
largest = obj
|
||||
largest_area = area
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if largest is None:
|
||||
continue
|
||||
|
||||
data = largest.get_data()
|
||||
filt = largest.get("/Filter")
|
||||
|
||||
if filt in ("/DCTDecode", "/JPXDecode"):
|
||||
# JPEG or JPEG2000
|
||||
out_images.append(data)
|
||||
else:
|
||||
# Flate or other; decode via Pillow
|
||||
mode = "RGB"
|
||||
colorspace = largest.get("/ColorSpace")
|
||||
if colorspace in ("/DeviceGray",):
|
||||
mode = "L"
|
||||
width = int(largest.get("/Width", 0))
|
||||
height = int(largest.get("/Height", 0))
|
||||
try:
|
||||
img = Image.frombytes(mode, (width, height), data)
|
||||
except Exception:
|
||||
img = Image.open(io.BytesIO(data))
|
||||
buf = io.BytesIO()
|
||||
img.save(buf, format="PNG")
|
||||
out_images.append(buf.getvalue())
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not out_images:
|
||||
raise RuntimeError("No images extracted via PyPDF2 fallback")
|
||||
return out_images
|
||||
except Exception as fallback_e:
|
||||
logger.error("PDF conversion failed (both methods)", error=str(fallback_e))
|
||||
raise
|
||||
|
||||
|
||||
async def _process_page(
|
||||
@@ -395,6 +464,8 @@ async def _process_page(
|
||||
layoutlm_result.get("confidence", 0),
|
||||
),
|
||||
}
|
||||
elif strategy == "vision":
|
||||
return await _process_with_vision(image_data, page_num)
|
||||
else:
|
||||
raise ValueError(f"Unknown strategy: {strategy}")
|
||||
|
||||
@@ -402,11 +473,6 @@ async def _process_page(
|
||||
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
|
||||
"""Process page with Tesseract OCR"""
|
||||
try:
|
||||
import io
|
||||
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
|
||||
# Load image
|
||||
image = Image.open(io.BytesIO(image_data))
|
||||
|
||||
@@ -414,13 +480,13 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
|
||||
config = f"{settings.tesseract_config} -l {settings.languages}"
|
||||
|
||||
# Extract text with confidence
|
||||
data = pytesseract.image_to_data(
|
||||
data = pytesseract.image_to_data( # type: ignore
|
||||
image, config=config, output_type=pytesseract.Output.DICT
|
||||
)
|
||||
|
||||
# Process results
|
||||
words: list[Any] = []
|
||||
confidences: list[Any] = []
|
||||
words: list[dict[str, Any]] = []
|
||||
confidences: list[float] = []
|
||||
|
||||
for i in range(len(data["text"])):
|
||||
if int(data["conf"][i]) > 0: # Valid confidence
|
||||
@@ -449,13 +515,6 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str,
|
||||
"word_count": len(words),
|
||||
}
|
||||
|
||||
except ImportError:
|
||||
logger.error("pytesseract not available")
|
||||
return {
|
||||
"page": page_num,
|
||||
"strategy": "tesseract",
|
||||
"error": "pytesseract not available",
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error("Tesseract processing failed", page=page_num, error=str(e))
|
||||
return {"page": page_num, "strategy": "tesseract", "error": str(e)}
|
||||
@@ -482,6 +541,68 @@ async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str,
|
||||
return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
|
||||
|
||||
|
||||
async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process one page with LLM vision OCR via the shared OCRProcessor.

    The page bytes are written to a temporary ``.png`` file because the
    processor is called with an ``image_path``. The file is removed again
    whether or not writing or processing succeeds.

    Args:
        image_data: Image bytes for a single page.
        page_num: Page number echoed back in the result.

    Returns:
        On success, a dict with ``page``, ``strategy`` (``"vision"``),
        ``text`` and ``confidence`` (always ``0.0``). On any failure,
        including an uninitialized processor, a dict with ``page``,
        ``strategy`` and ``error``; the exception is logged, not raised.
    """
    import tempfile

    try:
        vp = vision_processor
        if vp is None:
            raise RuntimeError("Vision OCR processor not initialized")

        tmp_path: str | None = None
        try:
            # delete=False: the path is handed to the processor after the
            # handle is closed, so the file must outlive the `with` block.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                # Record the path before writing so a failed write is still
                # cleaned up by the `finally` below.
                tmp_path = tmp.name
                tmp.write(image_data)

            # NOTE(review): process_image is called without await; if it
            # blocks (e.g. a remote model call) it stalls the event loop.
            # Consider asyncio.to_thread once thread-safety is confirmed.
            text = vp.process_image(
                image_path=tmp_path,
                format_type=settings.vision_format,
                preprocess=settings.vision_preprocess,
                language=settings.languages,
            )
        finally:
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    # Best-effort cleanup; a missing file is not an error.
                    pass

        return {
            "page": page_num,
            "strategy": "vision",
            "text": text if isinstance(text, str) else str(text),
            "confidence": 0.0,  # Not provided by LLM API
        }
    except Exception as e:
        logger.error("Vision processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "vision", "error": str(e)}
|
||||
|
||||
|
||||
# Strong references to in-flight OCR tasks. The event loop only keeps weak
# references to tasks, so an otherwise unreferenced task can be
# garbage-collected before it finishes.
_background_ocr_tasks: set[asyncio.Task[Any]] = set()


def _schedule_process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Sync wrapper that runs the async OCR task for a document.

    This keeps FastAPI BackgroundTasks type expectations satisfied under
    mypy strict.

    A plain ``asyncio.create_task`` only works when the calling thread has a
    running event loop. Non-coroutine background tasks are executed in a
    worker thread, where it raises ``RuntimeError`` and the coroutine never
    runs. This wrapper therefore handles both situations:

    * running loop in this thread: schedule the coroutine on it and keep a
      reference until it completes;
    * no running loop (worker thread): run the coroutine to completion on a
      loop owned by this thread.

    Args:
        doc_id: Identifier of the document to process.
        tenant_id: Tenant the document belongs to.
        content: Raw document bytes.
        strategy: OCR strategy name forwarded to the processor.
        processing_id: Identifier of this processing run.
        actor: Identity that triggered the processing.
    """
    coro = _process_document_async(
        doc_id=doc_id,
        tenant_id=tenant_id,
        content=content,
        strategy=strategy,
        processing_id=processing_id,
        actor=actor,
    )

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # NOTE(review): this runs on a separate loop in the worker thread;
        # clients bound to the server's loop may not tolerate that — confirm.
        # Passing _process_document_async straight to BackgroundTasks.add_task
        # (which accepts coroutine functions) would avoid the wrapper.
        asyncio.run(coro)
        return

    task = loop.create_task(coro)
    _background_ocr_tasks.add(task)
    task.add_done_callback(_background_ocr_tasks.discard)
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
pytesseract>=0.3.13
|
||||
|
||||
# PDF processing
|
||||
PyMuPDF>=1.26.4
|
||||
PyPDF2>=3.0.1
|
||||
pdf2image>=1.17.0
|
||||
|
||||
# Image processing
|
||||
|
||||
Reference in New Issue
Block a user