Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
145
libs/storage/document.py
Normal file
145
libs/storage/document.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""High-level document storage operations."""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
|
||||
from .client import StorageClient
|
||||
|
||||
# Module-level structured logger. NOTE(review): not referenced anywhere in
# this module's visible code — presumably kept for future use or import-side
# consistency with sibling modules; confirm before removing.
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class DocumentStorage:
|
||||
"""High-level document storage operations"""
|
||||
|
||||
def __init__(self, storage_client: StorageClient):
|
||||
self.storage = storage_client
|
||||
|
||||
async def store_document( # pylint: disable=too-many-arguments,too-many-positional-arguments
|
||||
self,
|
||||
tenant_id: str,
|
||||
doc_id: str,
|
||||
content: bytes,
|
||||
content_type: str = "application/pdf",
|
||||
metadata: dict[str, str] | None = None,
|
||||
bucket_name: str = "raw-documents",
|
||||
) -> dict[str, Any]:
|
||||
"""Store document with metadata"""
|
||||
|
||||
# Calculate checksum
|
||||
checksum = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Prepare metadata
|
||||
doc_metadata = {
|
||||
"tenant_id": tenant_id,
|
||||
"doc_id": doc_id,
|
||||
"checksum": checksum,
|
||||
"size": str(len(content)),
|
||||
**(metadata or {}),
|
||||
}
|
||||
|
||||
# Determine bucket and key
|
||||
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
|
||||
|
||||
# Upload to storage
|
||||
success = await self.storage.put_object(
|
||||
bucket_name=bucket_name,
|
||||
object_name=object_key,
|
||||
data=BytesIO(content),
|
||||
length=len(content),
|
||||
content_type=content_type,
|
||||
metadata=doc_metadata,
|
||||
)
|
||||
|
||||
if success:
|
||||
return {
|
||||
"bucket": bucket_name,
|
||||
"key": object_key,
|
||||
"checksum": checksum,
|
||||
"size": len(content),
|
||||
"s3_url": f"s3://{bucket_name}/{object_key}",
|
||||
}
|
||||
|
||||
raise RuntimeError("Failed to store document")
|
||||
|
||||
async def store_ocr_result(
|
||||
self, tenant_id: str, doc_id: str, ocr_data: dict[str, Any]
|
||||
) -> str:
|
||||
"""Store OCR results as JSON"""
|
||||
bucket_name = "evidence"
|
||||
object_key = f"tenants/{tenant_id}/ocr/{doc_id}.json"
|
||||
|
||||
# Convert to JSON bytes
|
||||
json_data = json.dumps(ocr_data, indent=2).encode("utf-8")
|
||||
|
||||
# Upload to storage
|
||||
success = await self.storage.put_object(
|
||||
bucket_name=bucket_name,
|
||||
object_name=object_key,
|
||||
data=BytesIO(json_data),
|
||||
length=len(json_data),
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
if success:
|
||||
return f"s3://{bucket_name}/{object_key}"
|
||||
|
||||
raise RuntimeError("Failed to store OCR result")
|
||||
|
||||
async def store_extraction_result(
|
||||
self, tenant_id: str, doc_id: str, extraction_data: dict[str, Any]
|
||||
) -> str:
|
||||
"""Store extraction results as JSON"""
|
||||
bucket_name = "evidence"
|
||||
object_key = f"tenants/{tenant_id}/extractions/{doc_id}.json"
|
||||
|
||||
# Convert to JSON bytes
|
||||
json_data = json.dumps(extraction_data, indent=2).encode("utf-8")
|
||||
|
||||
# Upload to storage
|
||||
success = await self.storage.put_object(
|
||||
bucket_name=bucket_name,
|
||||
object_name=object_key,
|
||||
data=BytesIO(json_data),
|
||||
length=len(json_data),
|
||||
content_type="application/json",
|
||||
)
|
||||
|
||||
if success:
|
||||
return f"s3://{bucket_name}/{object_key}"
|
||||
|
||||
raise RuntimeError("Failed to store extraction result")
|
||||
|
||||
async def get_document(self, tenant_id: str, doc_id: str) -> bytes | None:
|
||||
"""Retrieve document content"""
|
||||
bucket_name = "raw-documents"
|
||||
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
|
||||
|
||||
return await self.storage.get_object(bucket_name, object_key)
|
||||
|
||||
async def get_ocr_result(
|
||||
self, tenant_id: str, doc_id: str
|
||||
) -> dict[str, Any] | None:
|
||||
"""Retrieve OCR results"""
|
||||
bucket_name = "evidence"
|
||||
object_key = f"tenants/{tenant_id}/ocr/{doc_id}.json"
|
||||
|
||||
data = await self.storage.get_object(bucket_name, object_key)
|
||||
if data:
|
||||
return json.loads(data.decode("utf-8")) # type: ignore
|
||||
return None
|
||||
|
||||
async def get_extraction_result(
|
||||
self, tenant_id: str, doc_id: str
|
||||
) -> dict[str, Any] | None:
|
||||
"""Retrieve extraction results"""
|
||||
bucket_name = "evidence"
|
||||
object_key = f"tenants/{tenant_id}/extractions/{doc_id}.json"
|
||||
|
||||
data = await self.storage.get_object(bucket_name, object_key)
|
||||
if data:
|
||||
return json.loads(data.decode("utf-8")) # type: ignore
|
||||
return None
|
||||
Reference in New Issue
Block a user