recovered config
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
@@ -1,5 +1,6 @@
 """Qdrant collections CRUD, hybrid search, rerank wrapper, de-identification utilities."""
 
+from .chunker import DocumentChunker
 from .collection_manager import QdrantCollectionManager
 from .pii_detector import PIIDetector
 from .retriever import RAGRetriever
@@ -10,4 +11,5 @@ __all__ = [
     "QdrantCollectionManager",
     "RAGRetriever",
     "rag_search_for_citations",
+    "DocumentChunker",
 ]
libs/rag/chunker.py (new file, 134 lines)
@@ -0,0 +1,134 @@
"""Simple document chunker for RAG indexing.

Splits documents into manageable chunks using configuration options.
Supports text files directly and PDFs via pdfplumber when available.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import yaml


@dataclass
class ChunkerConfig:
    chunk_size: int = 1000
    chunk_overlap: int = 100
    max_chunks: int = 1000


class DocumentChunker:
    def __init__(self, config_path: str) -> None:
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                cfg = yaml.safe_load(f) or {}
        except Exception:
            cfg = {}

        rcfg = cfg.get("chunking", {}) if isinstance(cfg, dict) else {}
        self.config = ChunkerConfig(
            chunk_size=int(rcfg.get("chunk_size", 1000)),
            chunk_overlap=int(rcfg.get("chunk_overlap", 100)),
            max_chunks=int(rcfg.get("max_chunks", 1000)),
        )

    async def chunk_document(self, document_path: str, metadata: dict[str, Any]) -> list[dict[str, Any]]:
        path = Path(document_path)
        ext = path.suffix.lower()

        if ext == ".pdf":
            return await self._chunk_pdf(path, metadata)
        else:
            return await self._chunk_text_like(path, metadata)

    async def _chunk_pdf(self, path: Path, metadata: dict[str, Any]) -> list[dict[str, Any]]:
        chunks: list[dict[str, Any]] = []
        try:
            import pdfplumber  # type: ignore

            with pdfplumber.open(str(path)) as pdf:
                total_pages = len(pdf.pages)
                doc_id = metadata.get("doc_id") or path.stem
                for i, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text() or ""
                    if not text.strip():
                        continue
                    for j, content in enumerate(self._split_text(text), start=0):
                        cid = f"{doc_id}-p{i}-c{j}"
                        chunks.append(
                            {
                                "id": cid,
                                "document_id": doc_id,
                                "content": content,
                                "chunk_index": j,
                                "total_chunks": total_pages,
                                "page_numbers": [i],
                                "section_hierarchy": [],
                                "confidence_score": 1.0,
                            }
                        )
                        if len(chunks) >= self.config.max_chunks:
                            return chunks
        except Exception:
            # Fallback: treat as binary and produce a single empty chunk to avoid crashes
            chunks.append(
                {
                    "id": f"{path.stem}-p1-c0",
                    "document_id": path.stem,
                    "content": "",
                    "chunk_index": 0,
                    "total_chunks": 1,
                    "page_numbers": [1],
                    "section_hierarchy": [],
                    "confidence_score": 0.0,
                }
            )
        return chunks

    async def _chunk_text_like(self, path: Path, metadata: dict[str, Any]) -> list[dict[str, Any]]:
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            # As a last resort, read bytes and decode best-effort
            data = path.read_bytes()
            text = data.decode("utf-8", errors="ignore")

        doc_id = metadata.get("doc_id") or path.stem
        pieces = self._split_text(text)
        chunks: list[dict[str, Any]] = []
        total = min(len(pieces), self.config.max_chunks)
        for i, content in enumerate(pieces[:total]):
            chunks.append(
                {
                    "id": f"{doc_id}-c{i}",
                    "document_id": doc_id,
                    "content": content,
                    "chunk_index": i,
                    "total_chunks": total,
                    "page_numbers": [],
                    "section_hierarchy": [],
                    "confidence_score": 1.0,
                }
            )
        return chunks

    def _split_text(self, text: str) -> list[str]:
        size = max(self.config.chunk_size, 1)
        overlap = max(min(self.config.chunk_overlap, size - 1), 0)

        if not text:
            return [""]

        chunks: list[str] = []
        start = 0
        n = len(text)
        step = size - overlap if size > overlap else size
        while start < n and len(chunks) < self.config.max_chunks:
            end = min(start + size, n)
            chunks.append(text[start:end])
            start += step
        return chunks
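For orientation, here is a minimal sketch of how this chunker could be driven. The config path, module import path, and document names are assumptions for illustration, not taken from the repo:

import asyncio

from libs.rag.chunker import DocumentChunker  # import path assumed from the repo layout


async def main() -> None:
    # A missing or malformed config falls back to the ChunkerConfig defaults (1000/100/1000)
    chunker = DocumentChunker("config/rag.yaml")
    chunks = await chunker.chunk_document("notes.txt", {"doc_id": "notes-001"})
    for chunk in chunks[:3]:
        print(chunk["id"], chunk["chunk_index"], len(chunk["content"]))


asyncio.run(main())

With the default chunk_size of 1000 and overlap of 100, the window advances by 900 characters, so consecutive chunks share 100 characters and a 2500-character file yields three chunks starting at offsets 0, 900 and 1800.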
@@ -16,9 +16,10 @@ import yaml
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, PointStruct, SparseVector, VectorParams
 from sentence_transformers import SentenceTransformer
+from spacy.tokens import Doc
 
 from .chunker import DocumentChunker
-from .pii_detector import PIIDetector, PIIRedactor
+from .pii_detector import PIIDetector
 
 
 @dataclass
@@ -39,7 +40,6 @@ class RAGIndexer:
         self.qdrant_client = QdrantClient(url=qdrant_url)
         self.chunker = DocumentChunker(config_path)
         self.pii_detector = PIIDetector()
-        self.pii_redactor = PIIRedactor()
 
         # Initialize embedding models
         self.dense_model = SentenceTransformer(
@@ -54,13 +54,13 @@ class RAGIndexer:
 
         self.logger = logging.getLogger(__name__)
 
-    def _init_sparse_model(self):
+    def _init_sparse_model(self) -> Any | dict[str, Any]:
         """Initialize sparse embedding model (BM25 or SPLADE)"""
         sparse_config = self.config.get("sparse_model", {})
         model_type = sparse_config.get("type", "bm25")
 
         if model_type == "bm25":
-            from rank_bm25 import BM25Okapi
+            from rank_bm25 import BM25Okapi  # type: ignore
 
             return BM25Okapi
         elif model_type == "splade":
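For reference, rank_bm25's BM25Okapi (the class this method returns for the "bm25" type) is used along these lines; the corpus here is made up for illustration:

from rank_bm25 import BM25Okapi

corpus = [
    "income tax self assessment return",
    "corporation tax filing deadline",
    "vat return quarterly submission",
]
bm25 = BM25Okapi([doc.split() for doc in corpus])  # index the tokenized documents

scores = bm25.get_scores("tax return".split())  # one relevance score per document
print(scores)  # highest for documents sharing the most query terms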
@@ -142,13 +142,11 @@ class RAGIndexer:
 
             # Step 1: De-identify PII
             content = chunk["content"]
-            pii_detected = self.pii_detector.detect(content)
+            pii_detected = self.pii_detector.detect_pii(content)
 
             if pii_detected:
                 # Redact PII and create mapping
-                redacted_content, pii_mapping = self.pii_redactor.redact(
-                    content, pii_detected
-                )
+                redacted_content, pii_mapping = self.pii_detector.de_identify_text(content)
 
                 # Store PII mapping securely (not in vector DB)
                 await self._store_pii_mapping(chunk["id"], pii_mapping)
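This change folds the separate PIIRedactor into PIIDetector: one de_identify_text call now returns both the redacted text and the placeholder-to-original mapping, which is kept out of the vector DB. A sketch of the assumed contract; the placeholder format is invented here and the real one lives in libs/rag/pii_detector.py:

from libs.rag.pii_detector import PIIDetector  # import path assumed

detector = PIIDetector()
redacted, mapping = detector.de_identify_text("Call Jane on 07700 900123")
# e.g. redacted -> "Call [PERSON_1] on [PHONE_1]"   (placeholder format assumed)
#      mapping  -> {"[PERSON_1]": "Jane", ...}      (stored securely, never indexed)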
@@ -216,7 +214,7 @@ class RAGIndexer:
             ]
 
             # Create term frequency vector
-            term_freq = {}
+            term_freq: dict[str, int] = {}
             for token in tokens:
                 term_freq[token] = term_freq.get(token, 0) + 1
 
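Qdrant's SparseVector (imported in the hunk above) carries parallel indices and values lists. A minimal sketch of turning such a term-frequency dict into one; the deterministic hash-bucket mapping is an assumption for illustration, not necessarily this repo's vocabulary scheme:

import hashlib

from qdrant_client.models import SparseVector


def to_sparse_vector(term_freq: dict[str, int], dim: int = 2**20) -> SparseVector:
    # Deterministic term -> index bucketing (md5 keeps indices stable across runs,
    # unlike Python's salted built-in hash)
    indices = [int(hashlib.md5(t.encode()).hexdigest(), 16) % dim for t in term_freq]
    values = [float(f) for f in term_freq.values()]  # same iteration order as indices
    return SparseVector(indices=indices, values=values)


print(to_sparse_vector({"tax": 3, "return": 1}))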
@@ -378,7 +376,7 @@ class RAGIndexer:
             "language": doc.lang_ if hasattr(doc, "lang_") else "en",
         }
 
-    def _calculate_complexity(self, doc: dict) -> float:
+    def _calculate_complexity(self, doc: Doc) -> float:
         """Calculate text complexity score"""
         if not doc:
             return 0.0
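The hunk only shows the annotation tightening from dict to spaCy's Doc; the scoring body itself is not in the diff. One plausible heuristic, labelled as an assumption rather than the repo's actual formula:

from spacy.tokens import Doc


def calculate_complexity(doc: Doc) -> float:
    # Assumed heuristic: blend average word length with lexical diversity
    words = [t for t in doc if t.is_alpha]
    if not words:
        return 0.0
    avg_word_len = sum(len(t.text) for t in words) / len(words)
    diversity = len({t.lower_ for t in words}) / len(words)
    # Normalize both signals into roughly [0, 1] and average them
    return min((avg_word_len / 10.0 + diversity) / 2.0, 1.0)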