full ingestion -> OCR -> extraction flow is now working correctly.
Some checks failed
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
@@ -64,28 +64,6 @@ Return a JSON object with the extracted fields and confidence scores.
|
||||
"""
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-extract",
|
||||
title="Tax Agent Extraction Service",
|
||||
description="LLM-based field extraction service",
|
||||
settings_class=ExtractionSettings,
|
||||
)
|
||||
|
||||
# Add middleware
|
||||
middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
|
||||
app.add_middleware(middleware_factory)
|
||||
|
||||
# Global clients
|
||||
storage_client: StorageClient | None = None
|
||||
document_storage: DocumentStorage | None = None
|
||||
event_bus: EventBus | None = None
|
||||
confidence_calibrator: ConfidenceCalibrator | None = None
|
||||
tracer = get_tracer("svc-extract")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global storage_client, document_storage, event_bus, confidence_calibrator
|
||||
@@ -116,7 +94,6 @@ async def startup_event() -> None:
|
||||
logger.info("Extraction service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global event_bus
|
||||
@@ -129,6 +106,29 @@ async def shutdown_event() -> None:
|
||||
logger.info("Extraction service shutdown complete")
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-extract",
|
||||
title="Tax Agent Extraction Service",
|
||||
description="LLM-based field extraction service",
|
||||
settings_class=ExtractionSettings,
|
||||
startup_hooks=[startup_event],
|
||||
shutdown_hooks=[shutdown_event],
|
||||
)
|
||||
|
||||
# Add middleware
|
||||
middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
|
||||
app.add_middleware(middleware_factory)
|
||||
|
||||
# Global clients
|
||||
storage_client: StorageClient | None = None
|
||||
document_storage: DocumentStorage | None = None
|
||||
event_bus: EventBus | None = None
|
||||
confidence_calibrator: ConfidenceCalibrator | None = None
|
||||
tracer = get_tracer("svc-extract")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.post("/extract/{doc_id}", response_model=ExtractionResponse)
|
||||
async def extract_fields(
|
||||
doc_id: str,
|
||||
@@ -334,13 +334,14 @@ async def _extract_fields_async(
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("extractions_completed_total").labels(
|
||||
tenant_id=tenant_id, strategy=strategy
|
||||
).inc()
|
||||
metrics.counter(
|
||||
"extract_extractions_completed_total",
|
||||
labelnames=["tenant_id", "strategy"],
|
||||
).labels(tenant_id=tenant_id, strategy=strategy).inc()
|
||||
|
||||
metrics.histogram("extraction_confidence").labels(
|
||||
strategy=strategy
|
||||
).observe(calibrated_confidence)
|
||||
metrics.histogram(
|
||||
"extract_extraction_confidence", labelnames=["strategy"]
|
||||
).labels(strategy=strategy).observe(calibrated_confidence)
|
||||
|
||||
# Publish completion event
|
||||
event_payload = EventPayload(
|
||||
@@ -371,7 +372,10 @@ async def _extract_fields_async(
|
||||
logger.error("Field extraction failed", doc_id=doc_id, error=str(e))
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("extraction_errors_total").labels(
|
||||
metrics.counter(
|
||||
"extract_extraction_errors_total",
|
||||
labelnames=["tenant_id", "strategy", "error_type"],
|
||||
).labels(
|
||||
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
@@ -77,11 +77,20 @@ def init_dependencies(app_settings: IngestionSettings) -> None:
|
||||
|
||||
|
||||
# Create app and settings
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
if event_bus is None:
|
||||
raise ValueError("Event bus not initialized")
|
||||
|
||||
await event_bus.start()
|
||||
|
||||
|
||||
app, _settings = create_app(
|
||||
service_name="svc-ingestion",
|
||||
title="Tax Agent Ingestion Service",
|
||||
description="Document upload and storage service",
|
||||
settings_class=IngestionSettings,
|
||||
startup_hooks=[startup_event],
|
||||
)
|
||||
|
||||
# Initialize dependencies immediately
|
||||
@@ -158,6 +167,7 @@ async def upload_document(
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"doc_id": doc_id,
|
||||
"tenant_id": tenant_id,
|
||||
"filename": file.filename or "unknown",
|
||||
"kind": kind.value,
|
||||
"source": source,
|
||||
|
||||
@@ -21,8 +21,10 @@ RUN apt-get update && apt-get install -y \
|
||||
WORKDIR /app
|
||||
|
||||
# Copy service-specific requirements and install
|
||||
# Copy base requirements and service-specific requirements
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
|
||||
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
|
||||
RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
|
||||
@@ -118,7 +118,7 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
|
||||
if attempt == max_retries:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to connect to NATS after retries"
|
||||
)
|
||||
) from e
|
||||
await asyncio.sleep(delay)
|
||||
delay *= 2 # exponential backoff
|
||||
|
||||
@@ -280,7 +280,7 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
|
||||
return
|
||||
|
||||
# Auto-process PDF documents
|
||||
if data.get("content_type") == "application/pdf":
|
||||
if data.get("mime_type") == "application/pdf":
|
||||
logger.info("Auto-processing ingested document", doc_id=doc_id)
|
||||
|
||||
try:
|
||||
@@ -347,13 +347,13 @@ async def _process_document_async(
|
||||
await ds.store_ocr_result(tenant_id, doc_id, ocr_results)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("documents_processed_total").labels(
|
||||
tenant_id=tenant_id, strategy=strategy
|
||||
).inc()
|
||||
metrics.counter(
|
||||
"ocr_documents_processed_total", labelnames=["tenant_id", "strategy"]
|
||||
).labels(tenant_id=tenant_id, strategy=strategy).inc()
|
||||
|
||||
metrics.histogram("processing_duration_seconds").labels(
|
||||
strategy=strategy
|
||||
).observe(
|
||||
metrics.histogram(
|
||||
"ocr_processing_duration_seconds", labelnames=["strategy"]
|
||||
).labels(strategy=strategy).observe(
|
||||
datetime.utcnow().timestamp()
|
||||
- datetime.fromisoformat(
|
||||
ocr_results["processed_at"].replace("Z", "") # type: ignore
|
||||
@@ -386,7 +386,10 @@ async def _process_document_async(
|
||||
logger.error("OCR processing failed", doc_id=doc_id, error=str(e))
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("processing_errors_total").labels(
|
||||
metrics.counter(
|
||||
"ocr_processing_errors_total",
|
||||
labelnames=["tenant_id", "strategy", "error_type"],
|
||||
).labels(
|
||||
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user