Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
43
apps/svc_ocr/Dockerfile
Normal file
43
apps/svc_ocr/Dockerfile
Normal file
@@ -0,0 +1,43 @@
|
||||
# Dockerfile for svc_ocr - Uses base-ml image
# Base image contains: FastAPI, database drivers, transformers, PyTorch, numpy, etc.
# This Dockerfile adds OCR-specific dependencies and application code
#
# The COPY instructions reference libs/ and apps/svc_ocr/, so the build
# context must be the directory that contains both (the repository root).

ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}

# Switch to root to install system and service-specific dependencies
USER root

# Install OCR runtime dependencies (Tesseract, poppler).
# --no-install-recommends keeps optional "recommended" packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    tesseract-ocr \
    tesseract-ocr-eng \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy service-specific requirements and install.
# Done before copying the application code so this layer is only rebuilt
# when requirements.txt changes.
COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_ocr/ ./apps/svc_ocr/

# Set permissions and switch to non-root user
# (assumes the base image already defines the appuser user and group - TODO confirm)
RUN chown -R appuser:appuser /app
USER appuser

# Health check
# NOTE(review): apps/svc_ocr/main.py registers GET /health; /healthz is
# presumably provided by libs.app_factory.create_app - confirm, otherwise this
# probe always fails. Also assumes curl is present in the base image.
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_ocr.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
504
apps/svc_ocr/main.py
Normal file
504
apps/svc_ocr/main.py
Normal file
@@ -0,0 +1,504 @@
|
||||
# FILE: apps/svc_ocr/main.py
|
||||
# OCR and layout extraction using Tesseract, LayoutLM, and document AI
|
||||
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
from libs.storage import DocumentStorage, StorageClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class OCRSettings(BaseAppSettings):
    """Settings for OCR service.

    Extends ``BaseAppSettings`` with OCR-specific options. Of the fields
    declared here, the code visible in this module reads ``tesseract_config``,
    ``languages``, ``confidence_threshold`` and ``max_pages``; the remaining
    fields are declared but not referenced in the visible code.
    """

    service_name: str = "svc-ocr"

    # OCR configuration
    # NOTE(review): tesseract_cmd is not applied to pytesseract anywhere in the
    # visible code - confirm whether it is meant to be wired in.
    tesseract_cmd: str = "/usr/bin/tesseract"
    # Passed to pytesseract as the leading part of its ``config`` string.
    tesseract_config: str = "--oem 3 --psm 6"
    # Appended to the pytesseract config string after ``-l``.
    languages: str = "eng"

    # Layout analysis
    # Not referenced in the visible code (LayoutLM processing is a placeholder).
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    # Only echoed into the stored result metadata in the visible code.
    confidence_threshold: float = 0.7

    # Processing limits
    # Upper bound on the number of PDF pages rendered per document.
    max_pages: int = 50
    # Not enforced anywhere in the visible code.
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    # Neither flag is consulted in the visible code.
    include_coordinates: bool = True
    include_confidence: bool = True
|
||||
|
||||
|
||||
# Create app and settings
# create_app returns the application object together with the settings
# instance built from OCRSettings; both are used as module-level globals below.
app, settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Global clients
# These three stay None until startup_event() assigns them.
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
# Tracer and metrics handles are created at import time.
tracer = get_tracer("svc-ocr")
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies.

    Sets up observability, builds the storage clients, starts the event bus
    and subscribes ``_handle_document_ingested`` to ``DOC_INGESTED`` events.
    The created clients are stored in the module-level globals.

    Raises:
        RuntimeError: If ``create_event_bus`` returns a falsy value.
    """
    global storage_client, document_storage, event_bus

    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client and the storage wrappers built on top of it
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        # No HTTP request is in flight during startup, so an HTTPException
        # would never be rendered as a response; fail with a plain error.
        raise RuntimeError("Event bus not initialized")

    await event_bus.start()

    # Subscribe to document ingestion events
    await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)

    logger.info("OCR service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Stop the event bus, if one was created, when the app shuts down."""
    logger.info("Shutting down OCR service")

    # The module-level bus is only read here, never rebound.
    bus = event_bus
    if bus:
        await bus.stop()

    logger.info("OCR service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Return a static "healthy" status with service name, version and time."""
    report: dict[str, Any] = {"status": "healthy"}
    report["service"] = settings.service_name
    report["version"] = settings.service_version
    # Naive UTC timestamp in ISO-8601 form.
    report["timestamp"] = datetime.utcnow().isoformat()
    return report
|
||||
|
||||
|
||||
@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Process document with OCR.

    Loads the document, then schedules ``_process_document_async`` as a
    background task and returns immediately.

    Args:
        doc_id: Identifier of the document to process.
        background_tasks: Task queue the OCR job is added to.
        strategy: One of ``"tesseract"``, ``"layoutlm"`` or ``"hybrid"``.
        current_user: Authenticated user claims; ``sub`` is used as the actor.
        tenant_id: Tenant that owns the document.

    Returns:
        A dict with ``processing_id``, ``doc_id``, ``status`` and ``strategy``.

    Raises:
        HTTPException: 400 for an unsupported strategy, 404 when the document
            is not found, 500 for any other failure while starting the job.
    """
    # Strategies handled by _process_page. Without this check an unknown value
    # would be answered with "processing" and only fail later in the
    # background task, where the client can no longer see the error.
    supported_strategies = ("tesseract", "layoutlm", "hybrid")

    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        try:
            if strategy not in supported_strategies:
                raise HTTPException(
                    status_code=400, detail=f"Unknown strategy: {strategy}"
                )

            # Check if document exists
            doc_content = await document_storage.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Start background processing
            background_tasks.add_task(
                _process_document_async,
                doc_id,
                tenant_id,
                doc_content,
                strategy,
                processing_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )

            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }

        except HTTPException:
            # Deliberate HTTP errors (400/404 above) pass through unchanged.
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to start processing"
            ) from e
|
||||
|
||||
|
||||
@app.get("/results/{doc_id}")
async def get_ocr_results(
    doc_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Return the stored OCR result for a document.

    Responds with 404 when storage has no result for the document and with
    500 when the lookup itself fails.
    """
    with tracer.start_as_current_span("get_ocr_results") as span:
        for attr_name, attr_value in (("doc_id", doc_id), ("tenant_id", tenant_id)):
            span.set_attribute(attr_name, attr_value)

        try:
            # Get OCR results from storage
            stored = await document_storage.get_ocr_result(tenant_id, doc_id)
        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to get OCR results")
        else:
            if stored:
                return stored
            raise HTTPException(status_code=404, detail="OCR results not found")
|
||||
|
||||
|
||||
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
    """React to a document-ingested event by running OCR on PDF documents.

    Events without ``doc_id`` or ``tenant_id`` are logged and dropped. Other
    content types, and documents whose content cannot be loaded, are ignored.
    Any exception is logged and not propagated.
    """
    try:
        event = payload.data
        doc_id, tenant_id = event.get("doc_id"), event.get("tenant_id")

        if not (doc_id and tenant_id):
            logger.warning("Invalid document ingestion event", data=event)
            return

        # Only PDF documents are auto-processed.
        if event.get("content_type") != "application/pdf":
            return

        logger.info("Auto-processing ingested document", doc_id=doc_id)

        pdf_bytes = await document_storage.get_document(tenant_id, doc_id)
        if not pdf_bytes:
            return

        await _process_document_async(
            doc_id=doc_id,
            tenant_id=tenant_id,
            content=pdf_bytes,
            strategy="hybrid",
            processing_id=str(ulid.new()),
            actor=payload.actor,
        )

    except Exception as e:
        logger.error("Failed to handle document ingestion", error=str(e))
|
||||
|
||||
|
||||
async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously.

    Renders the PDF to page images, runs ``_process_page`` on each page in
    order, stores the combined result, records metrics and publishes a
    ``DOC_OCR_READY`` event. Failures are logged and counted, not re-raised.

    Args:
        doc_id: Identifier of the document being processed.
        tenant_id: Tenant that owns the document.
        content: Raw PDF bytes.
        strategy: OCR strategy forwarded to ``_process_page``.
        processing_id: Identifier of this processing run.
        actor: Identity recorded on the published event.
    """
    # Function-scope import, matching the other local imports in this module.
    import time

    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        # Monotonic start mark so the duration metric covers PDF rendering,
        # OCR and storage. Previously the metric was derived from
        # "processed_at", which is stamped after OCR has finished, so it only
        # measured the storage write.
        started = time.perf_counter()

        try:
            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page (page numbers are 1-based)
            pages_data: list[Any] = []
            for page_num, image in enumerate(images, 1):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()

            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe(time.perf_counter() - started)

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            # Best-effort background job: record the failure and return.
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()
|
||||
|
||||
|
||||
async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF to images.

    Renders at most ``settings.max_pages`` pages with PyMuPDF and returns one
    PNG byte string per page. If PyMuPDF cannot be imported, delegates to
    ``_pdf_to_images_fallback``.

    Args:
        pdf_content: Raw PDF bytes.

    Returns:
        PNG-encoded page images in page order.

    Raises:
        Exception: Any error raised while opening or rendering the PDF is
            logged and re-raised unchanged.
    """
    # Only the import decides whether the fallback is used; an ImportError
    # raised later during rendering is treated as a conversion failure.
    try:
        import fitz  # PyMuPDF
    except ImportError:
        logger.error("PyMuPDF not available, using fallback")
        return await _pdf_to_images_fallback(pdf_content)

    try:
        # Open PDF
        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")
        try:
            # 2x zoom for better OCR; built once because it is the same for every page
            zoom = fitz.Matrix(2.0, 2.0)
            page_count = min(len(pdf_doc), settings.max_pages)

            images: list[Any] = []
            for page_index in range(page_count):
                # Render page to image
                pix = pdf_doc[page_index].get_pixmap(matrix=zoom)
                images.append(pix.tobytes("png"))
            return images
        finally:
            # Close the document even when rendering a page fails.
            pdf_doc.close()

    except Exception as e:
        logger.error("PDF conversion failed", error=str(e))
        raise
|
||||
|
||||
|
||||
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
    """Render PDF pages to PNG bytes with pdf2image.

    Converts pages 1 through ``settings.max_pages`` at 200 dpi. Raises a
    generic ``Exception`` when pdf2image cannot be imported.
    """
    try:
        import io

        from pdf2image import convert_from_bytes

        rendered_pages = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )

        def _as_png(page_image: Any) -> bytes:
            # Serialise one PIL image into an in-memory PNG.
            buffer = io.BytesIO()
            page_image.save(buffer, format="PNG")
            return buffer.getvalue()

        return [_as_png(page_image) for page_image in rendered_pages]

    except ImportError:
        logger.error("pdf2image not available")
        raise Exception("No PDF conversion library available")
|
||||
|
||||
|
||||
async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Run the requested OCR strategy on one page image.

    ``"tesseract"`` and ``"layoutlm"`` return the respective engine result.
    ``"hybrid"`` runs both, keeps the Tesseract text and reports the higher
    of the two confidences. Any other strategy raises ``ValueError``.
    """
    if strategy not in ("tesseract", "layoutlm", "hybrid"):
        raise ValueError(f"Unknown strategy: {strategy}")

    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)

    if strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)

    # Hybrid: run both engines, Tesseract first.
    ocr_part = await _process_with_tesseract(image_data, page_num)
    layout_part = await _process_with_layoutlm(image_data, page_num)

    merged: dict[str, Any] = {
        "page": page_num,
        "strategy": "hybrid",
        "tesseract": ocr_part,
        "layoutlm": layout_part,
    }
    merged["text"] = ocr_part.get("text", "")
    merged["confidence"] = max(
        ocr_part.get("confidence", 0),
        layout_part.get("confidence", 0),
    )
    return merged
|
||||
|
||||
|
||||
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """OCR one page image with Tesseract.

    Returns the page text, the words with a confidence above zero (each with
    a ``[left, top, right, bottom]`` box) and their mean confidence. On any
    failure a dict carrying an ``error`` message is returned instead of
    raising.
    """
    try:
        import io

        import pytesseract
        from PIL import Image

        page_image = Image.open(io.BytesIO(image_data))

        # Engine options followed by the language selection.
        tess_config = f"{settings.tesseract_config} -l {settings.languages}"

        ocr = pytesseract.image_to_data(
            page_image, config=tess_config, output_type=pytesseract.Output.DICT
        )

        words: list[Any] = []
        for idx, token in enumerate(ocr["text"]):
            raw_conf = int(ocr["conf"][idx])
            if raw_conf <= 0:
                # Entries without a positive confidence are skipped.
                continue
            x0, y0 = ocr["left"][idx], ocr["top"][idx]
            words.append(
                {
                    "text": token,
                    "confidence": raw_conf / 100.0,
                    "bbox": [
                        x0,
                        y0,
                        x0 + ocr["width"][idx],
                        y0 + ocr["height"][idx],
                    ],
                }
            )

        confidences: list[Any] = [entry["confidence"] for entry in words]

        # Second pass over the same image to get the plain page text.
        page_text = pytesseract.image_to_string(page_image, config=tess_config)

        if confidences:
            mean_confidence = sum(confidences) / len(confidences)
        else:
            mean_confidence = 0.0

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": page_text.strip(),
            "words": words,
            "confidence": mean_confidence,
            "word_count": len(words),
        }

    except ImportError:
        logger.error("pytesseract not available")
        return {
            "page": page_num,
            "strategy": "tesseract",
            "error": "pytesseract not available",
        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}
|
||||
|
||||
|
||||
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Placeholder for LayoutLM page analysis.

    No model is invoked: a warning is logged and an empty result marked
    "Not implemented" with zero confidence is returned.
    """
    try:
        logger.warning("LayoutLM processing not implemented")

        placeholder: dict[str, Any] = {"page": page_num, "strategy": "layoutlm"}
        placeholder.update(
            text="",
            layout_elements=[],
            confidence=0.0,
            error="Not implemented",
        )
        return placeholder

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Render an HTTPException as an ErrorResponse JSON body (RFC7807 format).

    The exception detail is used for both ``title`` and ``detail``, and
    ``trace_id`` is always sent as an empty string.
    """
    status = exc.status_code
    problem = ErrorResponse(
        type=f"https://httpstatuses.com/{status}",
        title=exc.detail,
        status=status,
        detail=exc.detail,
        instance=str(request.url),
        trace_id="",
    )
    return JSONResponse(status_code=status, content=problem.model_dump())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct-execution entry point: serves the app with uvicorn auto-reload.
    import uvicorn

    # NOTE(review): this listens on port 8002 and uses the "main:app" import
    # string, whereas the Dockerfile CMD runs "apps.svc_ocr.main:app" on port
    # 8000 - presumably intentional for local development; confirm.
    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)
|
||||
16
apps/svc_ocr/requirements.txt
Normal file
16
apps/svc_ocr/requirements.txt
Normal file
@@ -0,0 +1,16 @@
|
||||
# Service-specific dependencies for svc_ocr
|
||||
# NOTE: ML dependencies (transformers, torch, numpy) are in base-ml image
|
||||
|
||||
# OCR engines (lightweight)
|
||||
pytesseract>=0.3.13
|
||||
|
||||
# PDF processing
|
||||
PyMuPDF>=1.26.4
|
||||
pdf2image>=1.17.0
|
||||
|
||||
# Image processing
|
||||
Pillow>=11.3.0
|
||||
opencv-python-headless>=4.12.0.88 # Headless version is smaller
|
||||
|
||||
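# Computer vision (torchvision not in base-ml; keep this version compatible with the torch build shipped in base-ml, otherwise pip may replace torch - verify against the base image)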
|
||||
torchvision>=0.23.0
|
||||
Reference in New Issue
Block a user