# FILE: retrieval/chunking.yaml
# Layout-aware chunking, tables, overlap, token targets
#
# NOTE(review): the block structure below was reconstructed from a copy of
# this file whose line breaks and indentation had been lost. Keys and values
# are unchanged; the nesting of a few sections (marked inline) is inferred
# from key names and should be confirmed against the consumer's schema.
---
# Target chunk sizes and overlap, with per-document-type overrides.
chunking_strategy:
  default:
    chunk_size: 1500  # tokens
    overlap_percentage: 0.12  # 12% overlap (value is a fraction, not a percent)
    min_chunk_size: 300
    max_chunk_size: 2000

  # NOTE(review): assumed to be a child of chunking_strategy — confirm.
  by_document_type:
    legislation:
      chunk_size: 2000  # Longer chunks for legal text
      overlap_percentage: 0.15
      preserve_sections: true
      section_headers: ["Section", "Subsection", "Paragraph", "Article"]

    best_practices:
      chunk_size: 1200
      overlap_percentage: 0.10
      preserve_lists: true

    glossary:
      chunk_size: 800  # Shorter for definitions
      overlap_percentage: 0.05
      preserve_definitions: true

    firm_knowledge:
      chunk_size: 1500
      overlap_percentage: 0.12
      preserve_procedures: true

# How document layout elements (tables, lists, headings, paragraphs) are
# treated when splitting.
layout_awareness:
  table_handling:
    strategy: "serialize_structured"
    max_table_size: 50  # rows
    column_separator: " | "
    # Double-quoted on purpose: "\n" is a real newline escape.
    row_separator: "\n"
    preserve_headers: true
    include_table_context: true  # Include surrounding text

  list_handling:
    preserve_structure: true
    # Markers are quoted so "-" and "*" are not read as YAML indicators.
    bullet_points: ["•", "-", "*", "1.", "a.", "i."]
    nested_indentation: true

  heading_hierarchy:
    preserve_levels: true
    max_heading_level: 6
    include_parent_headings: true  # For context

  paragraph_boundaries:
    respect_boundaries: true
    min_paragraph_length: 50  # characters
    merge_short_paragraphs: true

# Text clean-up applied to the source text.
text_preprocessing:
  normalization:
    unicode_normalization: "NFKC"
    remove_extra_whitespace: true
    standardize_quotes: true
    fix_encoding_issues: true

  pii_handling:
    de_identify_before_chunking: true
    placeholder_format: "[{type}_{hash}]"
    pii_types:
      - "UTR"
      - "NI_NUMBER"
      - "IBAN"
      - "SORT_CODE"
      - "PHONE"
      - "EMAIL"
      - "POSTCODE"
      - "NAME"
    hash_algorithm: "sha256"
    hash_truncate: 8  # characters

  legal_text_handling:
    preserve_citations: true
    # Double-quoted YAML: "\\d" decodes to the regex token \d.
    citation_patterns:
      - "Section \\d+[A-Z]?"
      - "Regulation \\d+"
      - "Schedule \\d+"
      - "Paragraph \\d+"
    preserve_cross_references: true

# Rules governing where a chunk may start and end.
chunking_rules:
  sentence_boundary_detection:
    use_spacy: true
    model: "en_core_web_sm"
    custom_abbreviations:
      - "Ltd"
      - "PLC"
      - "HMRC"
      - "UTR"
      - "NIC"
      - "PAYE"
      - "VAT"

  semantic_coherence:
    avoid_splitting:
      - "calculation_examples"
      - "step_by_step_procedures"
      - "form_instructions"
      - "definition_blocks"

  overlap_strategy:
    method: "sliding_window"
    overlap_unit: "sentences"  # vs "tokens" or "characters"
    preserve_context: true
    include_metadata_overlap: false

# Metadata fields attached to each chunk and content analysis settings.
metadata_enrichment:
  chunk_metadata:
    - "source_document_id"
    - "source_document_type"
    - "chunk_index"
    - "total_chunks"
    - "page_numbers"
    - "section_hierarchy"
    - "table_count"
    - "list_count"
    - "has_calculations"
    - "jurisdiction"
    - "tax_years"
    - "topic_tags"
    - "confidence_score"
    - "pii_free"

  content_analysis:
    extract_entities:
      - "tax_concepts"
      - "form_references"
      - "calculation_methods"
      - "deadlines"
      - "thresholds"
      - "rates"

    topic_classification:
      use_keywords: true
      keyword_lists:
        employment: ["PAYE", "payslip", "P60", "employment", "salary", "wages"]
        self_employment: ["self-employed", "business", "turnover", "expenses", "profit"]
        property: ["rental", "property", "landlord", "FHL", "mortgage interest"]
        dividends: ["dividend", "shares", "distribution", "corporation tax"]
        capital_gains: ["capital gains", "disposal", "acquisition", "CGT"]

# Validation, filtering and de-duplication of produced chunks.
quality_control:
  validation_rules:
    min_meaningful_content: 0.7  # Ratio of meaningful words
    max_repetition_ratio: 0.3  # Avoid highly repetitive chunks
    min_sentence_count: 2
    max_sentence_count: 20

  filtering:
    exclude_patterns:
      - "^\\s*$"  # Empty chunks
      - "^Page \\d+$"  # Page numbers only
      - "^\\[.*\\]$"  # Placeholder-only chunks
      - "^Table of Contents"
      - "^Index$"

  # NOTE(review): assumed to be a child of quality_control — confirm.
  post_processing:
    deduplicate_chunks: true
    similarity_threshold: 0.95
    merge_similar_chunks: false  # Keep separate for provenance

# Shape of emitted chunks and how they are batched and stored.
output_format:
  chunk_structure:
    id: "uuid4"
    content: "string"
    metadata: "object"
    embeddings: "optional"  # Added during indexing

  # NOTE(review): batch_processing and storage are assumed to be children of
  # output_format — confirm.
  batch_processing:
    batch_size: 100
    parallel_workers: 4
    memory_limit_mb: 1024

  storage:
    intermediate_format: "jsonl"
    compression: "gzip"
    include_source_mapping: true

performance_tuning:
  caching:
    cache_preprocessed: true
    cache_embeddings: false  # Too large
    cache_metadata: true
    ttl_hours: 24

  optimization:
    use_multiprocessing: true
    chunk_size_adaptation: true  # Adjust based on content type
    early_stopping: true  # For very long documents

# NOTE(review): placed at top level; it may instead belong under
# performance_tuning — confirm.
monitoring:
  track_processing_time: true
  track_chunk_quality_scores: true
  alert_on_failures: true
  log_statistics: true