"""Simple document chunker for RAG indexing. Splits documents into manageable chunks using configuration options. Supports text files directly and PDFs via pdfplumber when available. """ from __future__ import annotations from dataclasses import dataclass from pathlib import Path from typing import Any import yaml @dataclass class ChunkerConfig: chunk_size: int = 1000 chunk_overlap: int = 100 max_chunks: int = 1000 class DocumentChunker: def __init__(self, config_path: str) -> None: try: with open(config_path, "r", encoding="utf-8") as f: cfg = yaml.safe_load(f) or {} except Exception: cfg = {} rcfg = cfg.get("chunking", {}) if isinstance(cfg, dict) else {} self.config = ChunkerConfig( chunk_size=int(rcfg.get("chunk_size", 1000)), chunk_overlap=int(rcfg.get("chunk_overlap", 100)), max_chunks=int(rcfg.get("max_chunks", 1000)), ) async def chunk_document(self, document_path: str, metadata: dict[str, Any]) -> list[dict[str, Any]]: path = Path(document_path) ext = path.suffix.lower() if ext == ".pdf": return await self._chunk_pdf(path, metadata) else: return await self._chunk_text_like(path, metadata) async def _chunk_pdf(self, path: Path, metadata: dict[str, Any]) -> list[dict[str, Any]]: chunks: list[dict[str, Any]] = [] try: import pdfplumber # type: ignore with pdfplumber.open(str(path)) as pdf: total_pages = len(pdf.pages) doc_id = metadata.get("doc_id") or path.stem for i, page in enumerate(pdf.pages, start=1): text = page.extract_text() or "" if not text.strip(): continue for j, content in enumerate(self._split_text(text), start=0): cid = f"{doc_id}-p{i}-c{j}" chunks.append( { "id": cid, "document_id": doc_id, "content": content, "chunk_index": j, "total_chunks": total_pages, "page_numbers": [i], "section_hierarchy": [], "confidence_score": 1.0, } ) if len(chunks) >= self.config.max_chunks: return chunks except Exception: # Fallback: treat as binary and produce a single empty chunk to avoid crashes chunks.append( { "id": f"{path.stem}-p1-c0", "document_id": path.stem, "content": "", "chunk_index": 0, "total_chunks": 1, "page_numbers": [1], "section_hierarchy": [], "confidence_score": 0.0, } ) return chunks async def _chunk_text_like(self, path: Path, metadata: dict[str, Any]) -> list[dict[str, Any]]: try: text = path.read_text(encoding="utf-8", errors="ignore") except Exception: # As a last resort, read bytes and decode best-effort data = path.read_bytes() text = data.decode("utf-8", errors="ignore") doc_id = metadata.get("doc_id") or path.stem pieces = self._split_text(text) chunks: list[dict[str, Any]] = [] total = min(len(pieces), self.config.max_chunks) for i, content in enumerate(pieces[: total]): chunks.append( { "id": f"{doc_id}-c{i}", "document_id": doc_id, "content": content, "chunk_index": i, "total_chunks": total, "page_numbers": [], "section_hierarchy": [], "confidence_score": 1.0, } ) return chunks def _split_text(self, text: str) -> list[str]: size = max(self.config.chunk_size, 1) overlap = max(min(self.config.chunk_overlap, size - 1), 0) if not text: return [""] chunks: list[str] = [] start = 0 n = len(text) step = size - overlap if size > overlap else size while start < n and len(chunks) < self.config.max_chunks: end = min(start + size, n) chunks.append(text[start:end]) start += step return chunks