"""High-level document storage operations.""" import hashlib import json from io import BytesIO from typing import Any import structlog from .client import StorageClient logger = structlog.get_logger() class DocumentStorage: """High-level document storage operations""" def __init__(self, storage_client: StorageClient): self.storage = storage_client async def store_document( # pylint: disable=too-many-arguments,too-many-positional-arguments self, tenant_id: str, doc_id: str, content: bytes, content_type: str = "application/pdf", metadata: dict[str, str] | None = None, bucket_name: str = "raw-documents", ) -> dict[str, Any]: """Store document with metadata""" # Calculate checksum checksum = hashlib.sha256(content).hexdigest() # Prepare metadata doc_metadata = { "tenant_id": tenant_id, "doc_id": doc_id, "checksum": checksum, "size": str(len(content)), **(metadata or {}), } # Determine bucket and key object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf" # Upload to storage success = await self.storage.put_object( bucket_name=bucket_name, object_name=object_key, data=BytesIO(content), length=len(content), content_type=content_type, metadata=doc_metadata, ) if success: return { "bucket": bucket_name, "key": object_key, "checksum": checksum, "size": len(content), "s3_url": f"s3://{bucket_name}/{object_key}", } raise RuntimeError("Failed to store document") async def store_ocr_result( self, tenant_id: str, doc_id: str, ocr_data: dict[str, Any] ) -> str: """Store OCR results as JSON""" bucket_name = "evidence" object_key = f"tenants/{tenant_id}/ocr/{doc_id}.json" # Convert to JSON bytes json_data = json.dumps(ocr_data, indent=2).encode("utf-8") # Upload to storage success = await self.storage.put_object( bucket_name=bucket_name, object_name=object_key, data=BytesIO(json_data), length=len(json_data), content_type="application/json", ) if success: return f"s3://{bucket_name}/{object_key}" raise RuntimeError("Failed to store OCR result") async def store_extraction_result( self, tenant_id: str, doc_id: str, extraction_data: dict[str, Any] ) -> str: """Store extraction results as JSON""" bucket_name = "evidence" object_key = f"tenants/{tenant_id}/extractions/{doc_id}.json" # Convert to JSON bytes json_data = json.dumps(extraction_data, indent=2).encode("utf-8") # Upload to storage success = await self.storage.put_object( bucket_name=bucket_name, object_name=object_key, data=BytesIO(json_data), length=len(json_data), content_type="application/json", ) if success: return f"s3://{bucket_name}/{object_key}" raise RuntimeError("Failed to store extraction result") async def get_document(self, tenant_id: str, doc_id: str) -> bytes | None: """Retrieve document content""" bucket_name = "raw-documents" object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf" return await self.storage.get_object(bucket_name, object_key) async def get_ocr_result( self, tenant_id: str, doc_id: str ) -> dict[str, Any] | None: """Retrieve OCR results""" bucket_name = "evidence" object_key = f"tenants/{tenant_id}/ocr/{doc_id}.json" data = await self.storage.get_object(bucket_name, object_key) if data: return json.loads(data.decode("utf-8")) # type: ignore return None async def get_extraction_result( self, tenant_id: str, doc_id: str ) -> dict[str, Any] | None: """Retrieve extraction results""" bucket_name = "evidence" object_key = f"tenants/{tenant_id}/extractions/{doc_id}.json" data = await self.storage.get_object(bucket_name, object_key) if data: return json.loads(data.decode("utf-8")) # type: ignore return None