recovered config
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
@@ -1,5 +1,6 @@
 """Qdrant collections CRUD, hybrid search, rerank wrapper, de-identification utilities."""
 
+from .chunker import DocumentChunker
 from .collection_manager import QdrantCollectionManager
 from .pii_detector import PIIDetector
 from .retriever import RAGRetriever
@@ -10,4 +11,5 @@ __all__ = [
     "QdrantCollectionManager",
     "RAGRetriever",
     "rag_search_for_citations",
+    "DocumentChunker",
 ]
libs/rag/chunker.py (new file, 134 lines)
@@ -0,0 +1,134 @@
"""Simple document chunker for RAG indexing.

Splits documents into manageable chunks using configuration options.
Supports text files directly and PDFs via pdfplumber when available.
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any

import yaml


@dataclass
class ChunkerConfig:
    chunk_size: int = 1000
    chunk_overlap: int = 100
    max_chunks: int = 1000


class DocumentChunker:
    def __init__(self, config_path: str) -> None:
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                cfg = yaml.safe_load(f) or {}
        except Exception:
            cfg = {}

        rcfg = cfg.get("chunking", {}) if isinstance(cfg, dict) else {}
        self.config = ChunkerConfig(
            chunk_size=int(rcfg.get("chunk_size", 1000)),
            chunk_overlap=int(rcfg.get("chunk_overlap", 100)),
            max_chunks=int(rcfg.get("max_chunks", 1000)),
        )

    async def chunk_document(self, document_path: str, metadata: dict[str, Any]) -> list[dict[str, Any]]:
        path = Path(document_path)
        ext = path.suffix.lower()

        if ext == ".pdf":
            return await self._chunk_pdf(path, metadata)
        else:
            return await self._chunk_text_like(path, metadata)

    async def _chunk_pdf(self, path: Path, metadata: dict[str, Any]) -> list[dict[str, Any]]:
        chunks: list[dict[str, Any]] = []
        try:
            import pdfplumber  # type: ignore

            with pdfplumber.open(str(path)) as pdf:
                total_pages = len(pdf.pages)
                doc_id = metadata.get("doc_id") or path.stem
                for i, page in enumerate(pdf.pages, start=1):
                    text = page.extract_text() or ""
                    if not text.strip():
                        continue
                    for j, content in enumerate(self._split_text(text), start=0):
                        cid = f"{doc_id}-p{i}-c{j}"
                        chunks.append(
                            {
                                "id": cid,
                                "document_id": doc_id,
                                "content": content,
                                "chunk_index": j,
                                "total_chunks": total_pages,
                                "page_numbers": [i],
                                "section_hierarchy": [],
                                "confidence_score": 1.0,
                            }
                        )
                        if len(chunks) >= self.config.max_chunks:
                            return chunks
        except Exception:
            # Fallback: treat as binary and produce a single empty chunk to avoid crashes
            chunks.append(
                {
                    "id": f"{path.stem}-p1-c0",
                    "document_id": path.stem,
                    "content": "",
                    "chunk_index": 0,
                    "total_chunks": 1,
                    "page_numbers": [1],
                    "section_hierarchy": [],
                    "confidence_score": 0.0,
                }
            )
        return chunks

    async def _chunk_text_like(self, path: Path, metadata: dict[str, Any]) -> list[dict[str, Any]]:
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            # As a last resort, read bytes and decode best-effort
            data = path.read_bytes()
            text = data.decode("utf-8", errors="ignore")

        doc_id = metadata.get("doc_id") or path.stem
        pieces = self._split_text(text)
        chunks: list[dict[str, Any]] = []
        total = min(len(pieces), self.config.max_chunks)
        for i, content in enumerate(pieces[:total]):
            chunks.append(
                {
                    "id": f"{doc_id}-c{i}",
                    "document_id": doc_id,
                    "content": content,
                    "chunk_index": i,
                    "total_chunks": total,
                    "page_numbers": [],
                    "section_hierarchy": [],
                    "confidence_score": 1.0,
                }
            )
        return chunks

    def _split_text(self, text: str) -> list[str]:
        size = max(self.config.chunk_size, 1)
        overlap = max(min(self.config.chunk_overlap, size - 1), 0)

        if not text:
            return [""]

        chunks: list[str] = []
        start = 0
        n = len(text)
        step = size - overlap if size > overlap else size
        while start < n and len(chunks) < self.config.max_chunks:
            end = min(start + size, n)
            chunks.append(text[start:end])
            start += step
        return chunks
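For orientation, here is a minimal sketch of how this chunker could be driven. The config path, module import path, and document names are assumptions for illustration, not taken from the repo:

import asyncio

from libs.rag.chunker import DocumentChunker  # import path assumed from the repo layout


async def main() -> None:
    # A missing or malformed config falls back to the ChunkerConfig defaults (1000/100/1000)
    chunker = DocumentChunker("config/rag.yaml")
    chunks = await chunker.chunk_document("notes.txt", {"doc_id": "notes-001"})
    for chunk in chunks[:3]:
        print(chunk["id"], chunk["chunk_index"], len(chunk["content"]))


asyncio.run(main())

With the default chunk_size of 1000 and overlap of 100, the window advances by 900 characters, so consecutive chunks share 100 characters and a 2500-character file yields three chunks starting at offsets 0, 900 and 1800.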
@@ -16,9 +16,10 @@ import yaml
 from qdrant_client import QdrantClient
 from qdrant_client.models import Distance, PointStruct, SparseVector, VectorParams
 from sentence_transformers import SentenceTransformer
+from spacy.tokens import Doc
 
 from .chunker import DocumentChunker
-from .pii_detector import PIIDetector, PIIRedactor
+from .pii_detector import PIIDetector
 
 
 @dataclass
@@ -39,7 +40,6 @@ class RAGIndexer:
         self.qdrant_client = QdrantClient(url=qdrant_url)
         self.chunker = DocumentChunker(config_path)
         self.pii_detector = PIIDetector()
-        self.pii_redactor = PIIRedactor()
 
         # Initialize embedding models
         self.dense_model = SentenceTransformer(
@@ -54,13 +54,13 @@ class RAGIndexer:
 
         self.logger = logging.getLogger(__name__)
 
-    def _init_sparse_model(self):
+    def _init_sparse_model(self) -> Any | dict[str, Any]:
         """Initialize sparse embedding model (BM25 or SPLADE)"""
         sparse_config = self.config.get("sparse_model", {})
         model_type = sparse_config.get("type", "bm25")
 
         if model_type == "bm25":
-            from rank_bm25 import BM25Okapi
+            from rank_bm25 import BM25Okapi  # type: ignore
 
             return BM25Okapi
         elif model_type == "splade":
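For reference, rank_bm25's BM25Okapi (the class this method returns for the "bm25" type) is used along these lines; the corpus here is made up for illustration:

from rank_bm25 import BM25Okapi

corpus = [
    "income tax self assessment return",
    "corporation tax filing deadline",
    "vat return quarterly submission",
]
bm25 = BM25Okapi([doc.split() for doc in corpus])  # index the tokenized documents

scores = bm25.get_scores("tax return".split())  # one relevance score per document
print(scores)  # highest for documents sharing the most query terms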
@@ -142,13 +142,11 @@ class RAGIndexer:
 
             # Step 1: De-identify PII
             content = chunk["content"]
-            pii_detected = self.pii_detector.detect(content)
+            pii_detected = self.pii_detector.detect_pii(content)
 
             if pii_detected:
                 # Redact PII and create mapping
-                redacted_content, pii_mapping = self.pii_redactor.redact(
-                    content, pii_detected
-                )
+                redacted_content, pii_mapping = self.pii_detector.de_identify_text(content)
 
                 # Store PII mapping securely (not in vector DB)
                 await self._store_pii_mapping(chunk["id"], pii_mapping)
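This change folds the separate PIIRedactor into PIIDetector: one de_identify_text call now returns both the redacted text and the placeholder-to-original mapping, which is kept out of the vector DB. A sketch of the assumed contract; the placeholder format is invented here and the real one lives in libs/rag/pii_detector.py:

from libs.rag.pii_detector import PIIDetector  # import path assumed

detector = PIIDetector()
redacted, mapping = detector.de_identify_text("Call Jane on 07700 900123")
# e.g. redacted -> "Call [PERSON_1] on [PHONE_1]"   (placeholder format assumed)
#      mapping  -> {"[PERSON_1]": "Jane", ...}      (stored securely, never indexed)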
@@ -216,7 +214,7 @@ class RAGIndexer:
             ]
 
             # Create term frequency vector
-            term_freq = {}
+            term_freq: dict[str, int] = {}
             for token in tokens:
                 term_freq[token] = term_freq.get(token, 0) + 1
 
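Qdrant's SparseVector (imported in the hunk above) carries parallel indices and values lists. A minimal sketch of turning such a term-frequency dict into one; the deterministic hash-bucket mapping is an assumption for illustration, not necessarily this repo's vocabulary scheme:

import hashlib

from qdrant_client.models import SparseVector


def to_sparse_vector(term_freq: dict[str, int], dim: int = 2**20) -> SparseVector:
    # Deterministic term -> index bucketing (md5 keeps indices stable across runs,
    # unlike Python's salted built-in hash)
    indices = [int(hashlib.md5(t.encode()).hexdigest(), 16) % dim for t in term_freq]
    values = [float(f) for f in term_freq.values()]  # same iteration order as indices
    return SparseVector(indices=indices, values=values)


print(to_sparse_vector({"tax": 3, "return": 1}))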
@@ -378,7 +376,7 @@ class RAGIndexer:
             "language": doc.lang_ if hasattr(doc, "lang_") else "en",
         }
 
-    def _calculate_complexity(self, doc: dict) -> float:
+    def _calculate_complexity(self, doc: Doc) -> float:
         """Calculate text complexity score"""
         if not doc:
             return 0.0
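The hunk only shows the annotation tightening from dict to spaCy's Doc; the scoring body itself is not in the diff. One plausible heuristic, labelled as an assumption rather than the repo's actual formula:

from spacy.tokens import Doc


def calculate_complexity(doc: Doc) -> float:
    # Assumed heuristic: blend average word length with lexical diversity
    words = [t for t in doc if t.is_alpha]
    if not words:
        return 0.0
    avg_word_len = sum(len(t.text) for t in words) / len(words)
    diversity = len({t.lower_ for t in words}) / len(words)
    # Normalize both signals into roughly [0, 1] and average them
    return min((avg_word_len / 10.0 + diversity) / 2.0, 1.0)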