recovered config
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

This commit is contained in:
harkon
2025-10-16 08:57:14 +01:00
parent eea46ac89c
commit 8fe5e62fee
14 changed files with 775 additions and 1000 deletions

View File

@@ -16,9 +16,10 @@ import yaml
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, SparseVector, VectorParams
from sentence_transformers import SentenceTransformer
+from spacy.tokens import Doc
from .chunker import DocumentChunker
-from .pii_detector import PIIDetector, PIIRedactor
+from .pii_detector import PIIDetector
@dataclass
@@ -39,7 +40,6 @@ class RAGIndexer:
self.qdrant_client = QdrantClient(url=qdrant_url)
self.chunker = DocumentChunker(config_path)
self.pii_detector = PIIDetector()
-self.pii_redactor = PIIRedactor()
# Initialize embedding models
self.dense_model = SentenceTransformer(
@@ -54,13 +54,13 @@ class RAGIndexer:
self.logger = logging.getLogger(__name__)
-def _init_sparse_model(self):
+def _init_sparse_model(self) -> Any | dict[str, Any]:
"""Initialize sparse embedding model (BM25 or SPLADE)"""
sparse_config = self.config.get("sparse_model", {})
model_type = sparse_config.get("type", "bm25")
if model_type == "bm25":
-from rank_bm25 import BM25Okapi
+from rank_bm25 import BM25Okapi # type: ignore
return BM25Okapi
elif model_type == "splade":
@@ -142,13 +142,11 @@ class RAGIndexer:
# Step 1: De-identify PII
content = chunk["content"]
-pii_detected = self.pii_detector.detect(content)
+pii_detected = self.pii_detector.detect_pii(content)
if pii_detected:
# Redact PII and create mapping
-redacted_content, pii_mapping = self.pii_redactor.redact(
-content, pii_detected
-)
+redacted_content, pii_mapping = self.pii_detector.de_identify_text(content)
# Store PII mapping securely (not in vector DB)
await self._store_pii_mapping(chunk["id"], pii_mapping)
@@ -216,7 +214,7 @@ class RAGIndexer:
]
# Create term frequency vector
-term_freq = {}
+term_freq: dict[str, int] = {}
for token in tokens:
term_freq[token] = term_freq.get(token, 0) + 1
@@ -378,7 +376,7 @@ class RAGIndexer:
"language": doc.lang_ if hasattr(doc, "lang_") else "en",
}
-def _calculate_complexity(self, doc: dict) -> float:
+def _calculate_complexity(self, doc: Doc) -> float:
"""Calculate text complexity score"""
if not doc:
return 0.0