recovered config
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
@@ -16,9 +16,10 @@ import yaml
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import Distance, PointStruct, SparseVector, VectorParams
|
||||
from sentence_transformers import SentenceTransformer
|
||||
from spacy.tokens import Doc
|
||||
|
||||
from .chunker import DocumentChunker
|
||||
from .pii_detector import PIIDetector, PIIRedactor
|
||||
from .pii_detector import PIIDetector
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -39,7 +40,6 @@ class RAGIndexer:
|
||||
self.qdrant_client = QdrantClient(url=qdrant_url)
|
||||
self.chunker = DocumentChunker(config_path)
|
||||
self.pii_detector = PIIDetector()
|
||||
self.pii_redactor = PIIRedactor()
|
||||
|
||||
# Initialize embedding models
|
||||
self.dense_model = SentenceTransformer(
|
||||
@@ -54,13 +54,13 @@ class RAGIndexer:
|
||||
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
def _init_sparse_model(self):
|
||||
def _init_sparse_model(self) -> Any | dict[str, Any]:
|
||||
"""Initialize sparse embedding model (BM25 or SPLADE)"""
|
||||
sparse_config = self.config.get("sparse_model", {})
|
||||
model_type = sparse_config.get("type", "bm25")
|
||||
|
||||
if model_type == "bm25":
|
||||
from rank_bm25 import BM25Okapi
|
||||
from rank_bm25 import BM25Okapi # type: ignore
|
||||
|
||||
return BM25Okapi
|
||||
elif model_type == "splade":
|
||||
@@ -142,13 +142,11 @@ class RAGIndexer:
|
||||
|
||||
# Step 1: De-identify PII
|
||||
content = chunk["content"]
|
||||
pii_detected = self.pii_detector.detect(content)
|
||||
pii_detected = self.pii_detector.detect_pii(content)
|
||||
|
||||
if pii_detected:
|
||||
# Redact PII and create mapping
|
||||
redacted_content, pii_mapping = self.pii_redactor.redact(
|
||||
content, pii_detected
|
||||
)
|
||||
redacted_content, pii_mapping = self.pii_detector.de_identify_text(content)
|
||||
|
||||
# Store PII mapping securely (not in vector DB)
|
||||
await self._store_pii_mapping(chunk["id"], pii_mapping)
|
||||
@@ -216,7 +214,7 @@ class RAGIndexer:
|
||||
]
|
||||
|
||||
# Create term frequency vector
|
||||
term_freq = {}
|
||||
term_freq: dict[str, int] = {}
|
||||
for token in tokens:
|
||||
term_freq[token] = term_freq.get(token, 0) + 1
|
||||
|
||||
@@ -378,7 +376,7 @@ class RAGIndexer:
|
||||
"language": doc.lang_ if hasattr(doc, "lang_") else "en",
|
||||
}
|
||||
|
||||
def _calculate_complexity(self, doc: dict) -> float:
|
||||
def _calculate_complexity(self, doc: Doc) -> float:
|
||||
"""Calculate text complexity score"""
|
||||
if not doc:
|
||||
return 0.0
|
||||
|
||||
Reference in New Issue
Block a user