Initial commit
Some checks failed: every CI/CD Pipeline job on this push (code quality & linting, policy validation, the test suite, the per-service Docker image builds and security scans, SBOM generation, the staging and production deploys, and notifications) was cancelled.

harkon authored 2025-10-11 08:41:36 +01:00 · commit b324ff09ef
276 changed files with 55220 additions and 0 deletions

libs/storage/__init__.py Normal file

@@ -0,0 +1,9 @@
"""Storage client and document management for MinIO/S3."""
from .client import StorageClient
from .document import DocumentStorage
__all__ = [
"StorageClient",
"DocumentStorage",
]
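For context (not part of the commit), a consumer imports the re-exported names directly; the libs.storage package path is an assumption based on the repository layout:

from libs.storage import DocumentStorage, StorageClient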

libs/storage/client.py Normal file

@@ -0,0 +1,231 @@
"""MinIO/S3 storage client wrapper."""
from datetime import timedelta
from typing import Any, BinaryIO
import structlog
from minio import Minio
from minio.error import S3Error
logger = structlog.get_logger()
class StorageClient:
"""MinIO/S3 storage client wrapper"""
def __init__(self, minio_client: Minio):
self.client = minio_client
async def ensure_bucket(self, bucket_name: str, region: str = "us-east-1") -> bool:
"""Ensure bucket exists, create if not"""
try:
# Check if bucket exists
if self.client.bucket_exists(bucket_name):
logger.debug("Bucket already exists", bucket=bucket_name)
return True
# Create bucket
self.client.make_bucket(bucket_name, location=region)
logger.info("Created bucket", bucket=bucket_name, region=region)
return True
except S3Error as e:
logger.error("Failed to ensure bucket", bucket=bucket_name, error=str(e))
return False
async def put_object( # pylint: disable=too-many-arguments,too-many-positional-arguments
self,
bucket_name: str,
object_name: str,
data: BinaryIO,
length: int,
content_type: str = "application/octet-stream",
metadata: dict[str, str] | None = None,
) -> bool:
"""Upload object to bucket"""
try:
# Ensure bucket exists
await self.ensure_bucket(bucket_name)
# Upload object
result = self.client.put_object(
bucket_name=bucket_name,
object_name=object_name,
data=data,
length=length,
content_type=content_type,
metadata=metadata or {}, # fmt: skip # pyright: ignore[reportArgumentType]
)
logger.info(
"Object uploaded",
bucket=bucket_name,
object=object_name,
etag=result.etag,
size=length,
)
return True
except S3Error as e:
logger.error(
"Failed to upload object",
bucket=bucket_name,
object=object_name,
error=str(e),
)
return False
async def get_object(self, bucket_name: str, object_name: str) -> bytes | None:
"""Download object from bucket"""
try:
response = self.client.get_object(bucket_name, object_name)
data = response.read()
response.close()
response.release_conn()
logger.debug(
"Object downloaded",
bucket=bucket_name,
object=object_name,
size=len(data),
)
return data # type: ignore
except S3Error as e:
logger.error(
"Failed to download object",
bucket=bucket_name,
object=object_name,
error=str(e),
)
return None
async def get_object_stream(self, bucket_name: str, object_name: str) -> Any:
"""Get object as stream"""
try:
response = self.client.get_object(bucket_name, object_name)
return response
except S3Error as e:
logger.error(
"Failed to get object stream",
bucket=bucket_name,
object=object_name,
error=str(e),
)
return None
async def object_exists(self, bucket_name: str, object_name: str) -> bool:
"""Check if object exists"""
try:
self.client.stat_object(bucket_name, object_name)
return True
except S3Error:
return False
async def delete_object(self, bucket_name: str, object_name: str) -> bool:
"""Delete object from bucket"""
try:
self.client.remove_object(bucket_name, object_name)
logger.info("Object deleted", bucket=bucket_name, object=object_name)
return True
except S3Error as e:
logger.error(
"Failed to delete object",
bucket=bucket_name,
object=object_name,
error=str(e),
)
return False
async def list_objects(
self, bucket_name: str, prefix: str | None = None, recursive: bool = True
) -> list[str]:
"""List objects in bucket"""
try:
objects = self.client.list_objects(
bucket_name, prefix=prefix, recursive=recursive
)
return [obj.object_name for obj in objects if obj.object_name is not None]
except S3Error as e:
logger.error(
"Failed to list objects",
bucket=bucket_name,
prefix=prefix,
error=str(e),
)
return []
async def get_presigned_url(
self,
bucket_name: str,
object_name: str,
expires: timedelta = timedelta(hours=1),
method: str = "GET",
) -> str | None:
"""Generate presigned URL for object access"""
try:
url = self.client.get_presigned_url(
method=method,
bucket_name=bucket_name,
object_name=object_name,
expires=expires,
)
logger.debug(
"Generated presigned URL",
bucket=bucket_name,
object=object_name,
method=method,
expires=expires,
)
return str(url)
except S3Error as e:
logger.error(
"Failed to generate presigned URL",
bucket=bucket_name,
object=object_name,
error=str(e),
)
return None
async def copy_object(
self, source_bucket: str, source_object: str, dest_bucket: str, dest_object: str
) -> bool:
"""Copy object between buckets/locations"""
try:
# pylint: disable=import-outside-toplevel
from minio.commonconfig import CopySource
# Ensure destination bucket exists
await self.ensure_bucket(dest_bucket)
# Copy object
self.client.copy_object(
bucket_name=dest_bucket,
object_name=dest_object,
source=CopySource(source_bucket, source_object),
)
logger.info(
"Object copied",
source_bucket=source_bucket,
source_object=source_object,
dest_bucket=dest_bucket,
dest_object=dest_object,
)
return True
except S3Error as e:
logger.error(
"Failed to copy object",
source_bucket=source_bucket,
source_object=source_object,
dest_bucket=dest_bucket,
dest_object=dest_object,
error=str(e),
)
return False
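For reference, a minimal usage sketch (not part of the commit). The endpoint and credentials are placeholders for a local MinIO dev instance, and the object name is made up; it exercises only put_object and get_presigned_url from the class above:

import asyncio
from io import BytesIO

from minio import Minio

from libs.storage import StorageClient


async def main() -> None:
    # Placeholder endpoint/credentials for a local MinIO dev server.
    minio_client = Minio(
        "localhost:9000",
        access_key="minioadmin",
        secret_key="minioadmin",
        secure=False,
    )
    storage = StorageClient(minio_client)

    payload = b"hello world"
    ok = await storage.put_object(
        bucket_name="raw-documents",
        object_name="examples/hello.txt",
        data=BytesIO(payload),
        length=len(payload),
        content_type="text/plain",
    )
    if ok:
        # Presigned GET URL, valid for the default one hour.
        print(await storage.get_presigned_url("raw-documents", "examples/hello.txt"))


asyncio.run(main())

Note that the methods are declared async but delegate to the synchronous minio SDK, so calls block the event loop; that matches the code as committed.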

libs/storage/document.py Normal file

@@ -0,0 +1,145 @@
"""High-level document storage operations."""
import hashlib
import json
from io import BytesIO
from typing import Any
import structlog
from .client import StorageClient
logger = structlog.get_logger()
class DocumentStorage:
"""High-level document storage operations"""
def __init__(self, storage_client: StorageClient):
self.storage = storage_client
async def store_document( # pylint: disable=too-many-arguments,too-many-positional-arguments
self,
tenant_id: str,
doc_id: str,
content: bytes,
content_type: str = "application/pdf",
metadata: dict[str, str] | None = None,
bucket_name: str = "raw-documents",
) -> dict[str, Any]:
"""Store document with metadata"""
# Calculate checksum
checksum = hashlib.sha256(content).hexdigest()
# Prepare metadata
doc_metadata = {
"tenant_id": tenant_id,
"doc_id": doc_id,
"checksum": checksum,
"size": str(len(content)),
**(metadata or {}),
}
# Determine bucket and key
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
# Upload to storage
success = await self.storage.put_object(
bucket_name=bucket_name,
object_name=object_key,
data=BytesIO(content),
length=len(content),
content_type=content_type,
metadata=doc_metadata,
)
if success:
return {
"bucket": bucket_name,
"key": object_key,
"checksum": checksum,
"size": len(content),
"s3_url": f"s3://{bucket_name}/{object_key}",
}
raise RuntimeError("Failed to store document")
async def store_ocr_result(
self, tenant_id: str, doc_id: str, ocr_data: dict[str, Any]
) -> str:
"""Store OCR results as JSON"""
bucket_name = "evidence"
object_key = f"tenants/{tenant_id}/ocr/{doc_id}.json"
# Convert to JSON bytes
json_data = json.dumps(ocr_data, indent=2).encode("utf-8")
# Upload to storage
success = await self.storage.put_object(
bucket_name=bucket_name,
object_name=object_key,
data=BytesIO(json_data),
length=len(json_data),
content_type="application/json",
)
if success:
return f"s3://{bucket_name}/{object_key}"
raise RuntimeError("Failed to store OCR result")
async def store_extraction_result(
self, tenant_id: str, doc_id: str, extraction_data: dict[str, Any]
) -> str:
"""Store extraction results as JSON"""
bucket_name = "evidence"
object_key = f"tenants/{tenant_id}/extractions/{doc_id}.json"
# Convert to JSON bytes
json_data = json.dumps(extraction_data, indent=2).encode("utf-8")
# Upload to storage
success = await self.storage.put_object(
bucket_name=bucket_name,
object_name=object_key,
data=BytesIO(json_data),
length=len(json_data),
content_type="application/json",
)
if success:
return f"s3://{bucket_name}/{object_key}"
raise RuntimeError("Failed to store extraction result")
async def get_document(self, tenant_id: str, doc_id: str) -> bytes | None:
"""Retrieve document content"""
bucket_name = "raw-documents"
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
return await self.storage.get_object(bucket_name, object_key)
async def get_ocr_result(
self, tenant_id: str, doc_id: str
) -> dict[str, Any] | None:
"""Retrieve OCR results"""
bucket_name = "evidence"
object_key = f"tenants/{tenant_id}/ocr/{doc_id}.json"
data = await self.storage.get_object(bucket_name, object_key)
if data:
return json.loads(data.decode("utf-8")) # type: ignore
return None
async def get_extraction_result(
self, tenant_id: str, doc_id: str
) -> dict[str, Any] | None:
"""Retrieve extraction results"""
bucket_name = "evidence"
object_key = f"tenants/{tenant_id}/extractions/{doc_id}.json"
data = await self.storage.get_object(bucket_name, object_key)
if data:
return json.loads(data.decode("utf-8")) # type: ignore
return None
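A similar hedged sketch for the high-level wrapper (again not part of the commit; the tenant and document IDs and the PDF bytes are stand-ins). store_document computes the checksum and object key itself, so the caller supplies only raw bytes and optional metadata:

import asyncio

from minio import Minio

from libs.storage import DocumentStorage, StorageClient


async def main() -> None:
    # Placeholder connection details, as in the previous sketch.
    client = Minio(
        "localhost:9000",
        access_key="minioadmin",
        secret_key="minioadmin",
        secure=False,
    )
    docs = DocumentStorage(StorageClient(client))

    # Store a stub PDF under tenants/tenant-123/raw/doc-001.pdf and read it back.
    result = await docs.store_document(
        tenant_id="tenant-123",
        doc_id="doc-001",
        content=b"%PDF-1.4 stub bytes",
        metadata={"source": "upload"},
    )
    print(result["s3_url"], result["checksum"])

    raw = await docs.get_document("tenant-123", "doc-001")
    assert raw is not None


asyncio.run(main())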