# FILE: retrieval/chunking.yaml
# Layout-aware chunking, tables, overlap, token targets

chunking_strategy:
  # Chunk sizing and overlap: one baseline plus per-document-type entries.
  default:
    chunk_size: 1500 # unit: tokens
    overlap_percentage: 0.12 # expressed as a fraction (12%)
    # Presumably tokens as well, like chunk_size — TODO confirm.
    min_chunk_size: 300
    max_chunk_size: 2000

  by_document_type:
    # Legal text is given longer chunks.
    legislation:
      chunk_size: 2000
      overlap_percentage: 0.15
      preserve_sections: true
      section_headers:
        - 'Section'
        - 'Subsection'
        - 'Paragraph'
        - 'Article'

    best_practices:
      chunk_size: 1200
      overlap_percentage: 0.10
      preserve_lists: true

    # Definitions are given shorter chunks.
    glossary:
      chunk_size: 800
      overlap_percentage: 0.05
      preserve_definitions: true

    # Same size and overlap values as the default entry.
    firm_knowledge:
      chunk_size: 1500
      overlap_percentage: 0.12
      preserve_procedures: true

layout_awareness:
  # Per-element layout settings: tables, lists, headings, paragraphs.
  table_handling:
    strategy: 'serialize_structured'
    max_table_size: 50 # unit: rows
    column_separator: ' | '
    # Double quotes are required here so that \n is read as a newline escape.
    row_separator: "\n"
    preserve_headers: true
    include_table_context: true # keep the text surrounding the table

  list_handling:
    preserve_structure: true
    # Quoted on purpose: bare -, * and 1. are not plain strings in YAML.
    bullet_points:
      - '•'
      - '-'
      - '*'
      - '1.'
      - 'a.'
      - 'i.'
    nested_indentation: true

  heading_hierarchy:
    preserve_levels: true
    max_heading_level: 6
    include_parent_headings: true # parent headings provide context

  paragraph_boundaries:
    respect_boundaries: true
    min_paragraph_length: 50 # unit: characters
    merge_short_paragraphs: true

text_preprocessing:
  # Text clean-up, PII placeholders and legal-citation settings.
  normalization:
    unicode_normalization: 'NFKC'
    remove_extra_whitespace: true
    standardize_quotes: true
    fix_encoding_issues: true

  pii_handling:
    de_identify_before_chunking: true
    # Template with {type} and {hash} fields, wrapped in square brackets.
    placeholder_format: '[{type}_{hash}]'
    pii_types:
      - 'UTR'
      - 'NI_NUMBER'
      - 'IBAN'
      - 'SORT_CODE'
      - 'PHONE'
      - 'EMAIL'
      - 'POSTCODE'
      - 'NAME'
    # NOTE(review): a plain SHA-256 of low-entropy identifiers (sort codes,
    # phone numbers) can be brute-forced even when truncated — confirm the
    # de-identifier salts or keys the hash.
    hash_algorithm: 'sha256'
    hash_truncate: 8 # unit: characters

  legal_text_handling:
    preserve_citations: true
    # Regular expressions; single-quoted so backslashes stay literal.
    citation_patterns:
      - 'Section \d+[A-Z]?'
      - 'Regulation \d+'
      - 'Schedule \d+'
      - 'Paragraph \d+'
    preserve_cross_references: true

chunking_rules:
  # Sentence splitting, spans to keep whole, and overlap settings.
  sentence_boundary_detection:
    use_spacy: true
    model: 'en_core_web_sm'
    custom_abbreviations:
      - 'Ltd'
      - 'PLC'
      - 'HMRC'
      - 'UTR'
      - 'NIC'
      - 'PAYE'
      - 'VAT'

  semantic_coherence:
    # Content categories listed as not to be split.
    avoid_splitting:
      - 'calculation_examples'
      - 'step_by_step_procedures'
      - 'form_instructions'
      - 'definition_blocks'

  overlap_strategy:
    method: 'sliding_window'
    # Alternatives to "sentences": "tokens" or "characters".
    overlap_unit: 'sentences'
    preserve_context: true
    include_metadata_overlap: false

metadata_enrichment:
  # Metadata field names listed for each chunk.
  chunk_metadata:
    - 'source_document_id'
    - 'source_document_type'
    - 'chunk_index'
    - 'total_chunks'
    - 'page_numbers'
    - 'section_hierarchy'
    - 'table_count'
    - 'list_count'
    - 'has_calculations'
    - 'jurisdiction'
    - 'tax_years'
    - 'topic_tags'
    - 'confidence_score'
    - 'pii_free'

  content_analysis:
    # Entity categories to extract.
    extract_entities:
      - 'tax_concepts'
      - 'form_references'
      - 'calculation_methods'
      - 'deadlines'
      - 'thresholds'
      - 'rates'

    topic_classification:
      use_keywords: true
      # One keyword list per topic name.
      keyword_lists:
        employment:
          - 'PAYE'
          - 'payslip'
          - 'P60'
          - 'employment'
          - 'salary'
          - 'wages'
        self_employment:
          - 'self-employed'
          - 'business'
          - 'turnover'
          - 'expenses'
          - 'profit'
        property:
          - 'rental'
          - 'property'
          - 'landlord'
          - 'FHL'
          - 'mortgage interest'
        dividends:
          - 'dividend'
          - 'shares'
          - 'distribution'
          - 'corporation tax'
        capital_gains:
          - 'capital gains'
          - 'disposal'
          - 'acquisition'
          - 'CGT'

quality_control:
  # Validation thresholds, exclusion patterns and de-duplication settings.
  validation_rules:
    min_meaningful_content: 0.7 # Ratio of meaningful words
    max_repetition_ratio: 0.3 # Avoid highly repetitive chunks
    min_sentence_count: 2
    # NOTE(review): chunk_size targets in chunking_strategy range from 800 to
    # 2000 tokens; confirm that a 20-sentence ceiling does not flag full-size
    # chunks.
    max_sentence_count: 20

  filtering:
    # Regular expressions; single-quoted so backslashes stay literal.
    exclude_patterns:
      - '^\s*$' # Empty chunks
      - '^Page \d+$' # Page numbers only
      # Placeholder-only chunks: one or more bracketed tokens separated by
      # optional whitespace and nothing else. The earlier '^\[.*\]$' form also
      # matched real content that merely began with "[" and ended with "]",
      # e.g. "[NAME_x] paid [NAME_y]".
      - '^\[[^\[\]]*\](\s*\[[^\[\]]*\])*$'
      - '^Table of Contents'
      - '^Index$'

  post_processing:
    deduplicate_chunks: true
    similarity_threshold: 0.95
    merge_similar_chunks: false # Keep separate for provenance

output_format:
  # Shape of an emitted chunk, batching limits and intermediate storage.
  chunk_structure:
    id: 'uuid4'
    content: 'string'
    metadata: 'object'
    embeddings: 'optional' # filled in at indexing time

  batch_processing:
    batch_size: 100
    parallel_workers: 4
    memory_limit_mb: 1024 # unit: MB, per the key name

  storage:
    intermediate_format: 'jsonl'
    compression: 'gzip'
    include_source_mapping: true

performance_tuning:
  # Caching, optimisation and monitoring switches for the chunking run.
  caching:
    cache_preprocessed: true
    cache_embeddings: false # Too large
    cache_metadata: true
    # Time-to-live in hours, per the key name. NOTE(review): presumably applies
    # to every enabled cache above — confirm.
    ttl_hours: 24

  optimization:
    # NOTE(review): presumably sized by
    # output_format.batch_processing.parallel_workers — confirm.
    use_multiprocessing: true
    chunk_size_adaptation: true # Adjust based on content type
    early_stopping: true # For very long documents

  monitoring:
    track_processing_time: true
    track_chunk_quality_scores: true
    alert_on_failures: true
    log_statistics: true