Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
351
apps/svc_ingestion/main.py
Normal file
351
apps/svc_ingestion/main.py
Normal file
@@ -0,0 +1,351 @@
|
||||
"""Document upload, storage, checksum validation, metadata extraction service."""
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any, cast
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import Depends, File, HTTPException, Request, UploadFile
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app, get_tenant_dependency, get_user_dependency
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.observability import get_metrics, get_tracer
|
||||
from libs.schemas import DocumentKind, DocumentUploadResponse
|
||||
from libs.storage import DocumentStorage, StorageClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class IngestionSettings(BaseAppSettings):
    """Settings for ingestion service"""

    # Service identity used for logging/metrics/tracing labels.
    service_name: str = "svc-ingestion"

    # File upload limits
    # Hard cap on a single upload, enforced by _validate_upload.
    max_file_size: int = 50 * 1024 * 1024  # 50MB
    # Acceptable MIME types; matched against the client-declared
    # Content-Type first, then against mimetypes.guess_type(filename).
    # NOTE: mutable default is fine here — assumes BaseAppSettings is a
    # pydantic settings model that copies defaults per instance (TODO confirm).
    allowed_mime_types: list[str] = [
        "application/pdf",
        "image/jpeg",
        "image/png",
        "image/tiff",
        "text/csv",
        "application/vnd.ms-excel",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ]

    # Storage configuration
    # Bucket holding raw uploaded documents (keys under tenants/<id>/raw/).
    raw_documents_bucket: str = "raw-documents"
    # Bucket for derived evidence artifacts (not referenced in this module).
    evidence_bucket: str = "evidence"
|
||||
|
||||
|
||||
# Global clients (will be initialized in startup)
# Module-level singletons populated by init_dependencies(); every endpoint
# guards against None and returns 503 until initialization has run.
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None

# Settings will be initialized after app creation
# (declared here so endpoints can reference it; assigned in init_dependencies)
settings: IngestionSettings
|
||||
|
||||
|
||||
def init_dependencies(app_settings: IngestionSettings) -> None:
    """Initialize service dependencies.

    Populates the module-level singletons (``settings``, ``storage_client``,
    ``document_storage``, ``event_bus``) from the supplied settings. Must run
    before any request is served; endpoints return 503 while the globals are
    still ``None``.

    Args:
        app_settings: Fully-loaded settings for this service instance.
    """
    global storage_client, document_storage, event_bus, settings

    settings = app_settings
    # SECURITY FIX: never log credentials — the previous version emitted
    # minio_access_key into the structured log stream. Log the endpoint only.
    logger.info(
        "Starting ingestion service",
        minio_endpoint=settings.minio_endpoint,
    )

    # Initialize clients
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)
    event_bus = create_event_bus(settings)

    logger.info("Ingestion service started successfully")
|
||||
|
||||
|
||||
# Create app and settings
# create_app wires middleware, auth dependencies and health endpoints,
# and returns both the FastAPI app and the loaded settings instance.
app, _settings = create_app(
    service_name="svc-ingestion",
    title="Tax Agent Ingestion Service",
    description="Document upload and storage service",
    settings_class=IngestionSettings,
)

# Initialize dependencies immediately
# (at import time, so the module is ready as soon as the server loads it)
init_dependencies(cast(IngestionSettings, _settings))

# Get observability components
tracer = get_tracer("svc-ingestion")
metrics = get_metrics("svc-ingestion")


# Health endpoints are provided by app_factory
|
||||
|
||||
|
||||
@app.post("/upload", response_model=DocumentUploadResponse)
|
||||
async def upload_document(
|
||||
request: Request,
|
||||
file: UploadFile = File(...),
|
||||
kind: DocumentKind = DocumentKind.INVOICE,
|
||||
source: str = "manual_upload",
|
||||
current_user: dict[str, Any] = Depends(get_user_dependency()),
|
||||
tenant_id: str = Depends(get_tenant_dependency()),
|
||||
) -> DocumentUploadResponse:
|
||||
"""Upload document for processing"""
|
||||
|
||||
# Check if services are initialized
|
||||
if document_storage is None or event_bus is None:
|
||||
raise HTTPException(
|
||||
status_code=503, detail="Service not ready - dependencies not initialized"
|
||||
)
|
||||
|
||||
with tracer.start_as_current_span("upload_document") as span:
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("document_kind", kind.value)
|
||||
span.set_attribute("source", source)
|
||||
|
||||
try:
|
||||
# Validate file
|
||||
await _validate_upload(file)
|
||||
|
||||
# Generate document ID
|
||||
doc_id = f"doc_{ulid.new()}"
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
|
||||
# Read file content
|
||||
content = await file.read()
|
||||
|
||||
# Calculate checksum
|
||||
checksum = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Detect MIME type
|
||||
detected_mime = None
|
||||
if file.filename:
|
||||
detected_mime = mimetypes.guess_type(file.filename)[0]
|
||||
content_type = (
|
||||
detected_mime or file.content_type or "application/octet-stream"
|
||||
)
|
||||
|
||||
# Store document
|
||||
storage_result = await document_storage.store_document(
|
||||
tenant_id=tenant_id,
|
||||
doc_id=doc_id,
|
||||
content=content,
|
||||
content_type=content_type,
|
||||
metadata={
|
||||
"original_filename": file.filename or "unknown",
|
||||
"kind": kind.value,
|
||||
"source": source,
|
||||
"uploaded_by": current_user.get("sub", "unknown"),
|
||||
"uploaded_at": datetime.now(UTC).isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
# Publish event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"doc_id": doc_id,
|
||||
"tenant_id": tenant_id,
|
||||
"kind": kind.value,
|
||||
"source": source,
|
||||
"checksum": checksum,
|
||||
"file_size": len(content),
|
||||
"content_type": content_type,
|
||||
"s3_url": storage_result["s3_url"],
|
||||
},
|
||||
actor=current_user.get("sub", "system"),
|
||||
tenant_id=tenant_id,
|
||||
trace_id=str(span.get_span_context().trace_id),
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.DOC_INGESTED, event_payload)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter(
|
||||
"documents_uploaded_total", labelnames=["tenant_id", "kind", "source"]
|
||||
).labels(tenant_id=tenant_id, kind=kind.value, source=source).inc()
|
||||
|
||||
metrics.histogram(
|
||||
"document_size_bytes", labelnames=["tenant_id", "kind"]
|
||||
).labels(tenant_id=tenant_id, kind=kind.value).observe(len(content))
|
||||
|
||||
logger.info(
|
||||
"Document uploaded successfully",
|
||||
doc_id=doc_id,
|
||||
tenant_id=tenant_id,
|
||||
kind=kind.value,
|
||||
size=len(content),
|
||||
checksum=checksum,
|
||||
)
|
||||
|
||||
return DocumentUploadResponse(
|
||||
doc_id=doc_id, s3_url=storage_result["s3_url"], checksum=checksum
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
logger.warning("Upload validation failed", error=str(e))
|
||||
# Track validation errors
|
||||
try:
|
||||
metrics.counter(
|
||||
"upload_errors_total", labelnames=["tenant_id", "error_type"]
|
||||
).labels(tenant_id=tenant_id, error_type="ValueError").inc()
|
||||
except Exception:
|
||||
pass # Don't fail on metrics errors
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error("Upload failed", error=str(e))
|
||||
# Track upload errors
|
||||
try:
|
||||
metrics.counter(
|
||||
"upload_errors_total", labelnames=["tenant_id", "error_type"]
|
||||
).labels(tenant_id=tenant_id, error_type=type(e).__name__).inc()
|
||||
except Exception:
|
||||
pass # Don't fail on metrics errors
|
||||
raise HTTPException(status_code=500, detail="Upload failed")
|
||||
|
||||
|
||||
@app.get("/documents/{doc_id}")
|
||||
async def get_document_info(
|
||||
doc_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_user_dependency()),
|
||||
tenant_id: str = Depends(get_tenant_dependency()),
|
||||
) -> dict[str, str]:
|
||||
"""Get document information"""
|
||||
|
||||
# Check if services are initialized
|
||||
if storage_client is None:
|
||||
raise HTTPException(
|
||||
status_code=503, detail="Service not ready - dependencies not initialized"
|
||||
)
|
||||
|
||||
with tracer.start_as_current_span("get_document_info") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Check if document exists
|
||||
ingestion_settings = cast(IngestionSettings, settings)
|
||||
bucket_name = ingestion_settings.raw_documents_bucket
|
||||
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
|
||||
|
||||
exists = await storage_client.object_exists(bucket_name, object_key)
|
||||
|
||||
if not exists:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Get presigned URL for download
|
||||
download_url = await storage_client.get_presigned_url(
|
||||
bucket_name=bucket_name, object_name=object_key, method="GET"
|
||||
)
|
||||
|
||||
if not download_url:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to generate download URL"
|
||||
)
|
||||
|
||||
return {
|
||||
"doc_id": doc_id,
|
||||
"download_url": download_url,
|
||||
"s3_url": f"s3://{bucket_name}/{object_key}",
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to get document info", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to get document info")
|
||||
|
||||
|
||||
@app.delete("/documents/{doc_id}")
|
||||
async def delete_document(
|
||||
doc_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_user_dependency()),
|
||||
tenant_id: str = Depends(get_tenant_dependency()),
|
||||
) -> dict[str, str]:
|
||||
"""Delete document"""
|
||||
|
||||
# Check if services are initialized
|
||||
if storage_client is None:
|
||||
raise HTTPException(
|
||||
status_code=503, detail="Service not ready - dependencies not initialized"
|
||||
)
|
||||
|
||||
with tracer.start_as_current_span("delete_document") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Delete from storage
|
||||
ingestion_settings = cast(IngestionSettings, settings)
|
||||
bucket_name = ingestion_settings.raw_documents_bucket
|
||||
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
|
||||
|
||||
success = await storage_client.delete_object(bucket_name, object_key)
|
||||
|
||||
if not success:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
logger.info("Document deleted", doc_id=doc_id, tenant_id=tenant_id)
|
||||
|
||||
return {"message": "Document deleted successfully"}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to delete document", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to delete document")
|
||||
|
||||
|
||||
async def _validate_upload(file: UploadFile) -> None:
    """Reject uploads that violate size, MIME-type or filename constraints.

    Raises:
        ValueError: with a human-readable reason on the first failed check.
    """
    cfg = cast(IngestionSettings, settings)

    # Size check uses the client-declared size (from Content-Length).
    # NOTE(review): when file.size is None/0 this check is skipped — the
    # caller should re-enforce the cap on the bytes actually read.
    limit = cfg.max_file_size
    if file.size and file.size > limit:
        raise ValueError(f"File too large: {file.size} bytes (max: {limit})")

    # MIME check: accept the declared Content-Type, otherwise fall back to
    # an extension-based guess before rejecting.
    if file.content_type not in cfg.allowed_mime_types:
        guessed = mimetypes.guess_type(file.filename)[0] if file.filename else None
        if guessed not in cfg.allowed_mime_types:
            raise ValueError(f"Unsupported file type: {file.content_type}")

    # A filename is mandatory.
    if not file.filename:
        raise ValueError("Filename is required")

    # Reject path-traversal attempts and separators in the filename.
    if any(marker in file.filename for marker in ("..", "/", "\\")):
        raise ValueError("Invalid filename")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(
|
||||
"main:app",
|
||||
host="0.0.0.0",
|
||||
port=8000,
|
||||
reload=True,
|
||||
log_config=None, # Use structlog configuration
|
||||
)
|
||||
Reference in New Issue
Block a user