Files
ai-tax-agent/retrieval/chunking.yaml
harkon 8fe5e62fee
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
recovered config
2025-10-16 08:57:14 +01:00

204 lines
4.9 KiB
YAML

# FILE: retrieval/chunking.yaml
# Layout-aware chunking, tables, overlap, token targets
# Chunk sizing: one default profile plus per-document-type overrides.
# Sizes are in tokens; overlap_percentage is a fraction (0.12 == 12%).
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped (leaving duplicate chunk_size / overlap_percentage keys) --
# confirm the hierarchy against the loader's expected schema.
chunking_strategy:
  default:
    chunk_size: 1500  # tokens
    overlap_percentage: 0.12  # 12% overlap
    min_chunk_size: 300
    max_chunk_size: 2000

  by_document_type:
    legislation:
      chunk_size: 2000  # Longer chunks for legal text
      overlap_percentage: 0.15
      preserve_sections: true
      section_headers: ["Section", "Subsection", "Paragraph", "Article"]

    best_practices:
      chunk_size: 1200
      overlap_percentage: 0.10
      preserve_lists: true

    glossary:
      chunk_size: 800  # Shorter for definitions
      overlap_percentage: 0.05
      preserve_definitions: true

    firm_knowledge:
      chunk_size: 1500
      overlap_percentage: 0.12
      preserve_procedures: true
# Layout-aware handling of tables, lists, headings and paragraph boundaries.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped -- confirm the hierarchy against the loader's expected schema.
layout_awareness:
  table_handling:
    strategy: "serialize_structured"
    max_table_size: 50  # rows
    column_separator: " | "
    row_separator: "\n"
    preserve_headers: true
    include_table_context: true  # Include surrounding text

  list_handling:
    preserve_structure: true
    bullet_points: ["•", "-", "*", "1.", "a.", "i."]
    nested_indentation: true

  heading_hierarchy:
    preserve_levels: true
    max_heading_level: 6
    include_parent_headings: true  # For context

  paragraph_boundaries:
    respect_boundaries: true
    min_paragraph_length: 50  # characters
    merge_short_paragraphs: true
# Text clean-up before chunking: normalization, PII de-identification and
# legal-citation preservation.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped -- confirm the hierarchy against the loader's expected schema.
text_preprocessing:
  normalization:
    unicode_normalization: "NFKC"
    remove_extra_whitespace: true
    standardize_quotes: true
    fix_encoding_issues: true

  pii_handling:
    de_identify_before_chunking: true
    # {type} and {hash} are template fields; presumably filled with the PII
    # type name and the truncated hash -- verify against the consumer.
    placeholder_format: "[{type}_{hash}]"
    pii_types:
      - "UTR"
      - "NI_NUMBER"
      - "IBAN"
      - "SORT_CODE"
      - "PHONE"
      - "EMAIL"
      - "POSTCODE"
      - "NAME"
    hash_algorithm: "sha256"
    hash_truncate: 8  # characters

  legal_text_handling:
    preserve_citations: true
    # Double-quoted YAML: "\\d" yields the regex token \d after parsing.
    citation_patterns:
      - "Section \\d+[A-Z]?"
      - "Regulation \\d+"
      - "Schedule \\d+"
      - "Paragraph \\d+"
    preserve_cross_references: true
# Rules for where chunks may be split and how neighbouring chunks overlap.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped -- confirm the hierarchy against the loader's expected schema.
chunking_rules:
  sentence_boundary_detection:
    use_spacy: true
    model: "en_core_web_sm"
    custom_abbreviations:
      - "Ltd"
      - "PLC"
      - "HMRC"
      - "UTR"
      - "NIC"
      - "PAYE"
      - "VAT"

  semantic_coherence:
    avoid_splitting:
      - "calculation_examples"
      - "step_by_step_procedures"
      - "form_instructions"
      - "definition_blocks"

  overlap_strategy:
    method: "sliding_window"
    overlap_unit: "sentences"  # vs "tokens" or "characters"
    preserve_context: true
    include_metadata_overlap: false
# Metadata fields attached to each chunk, plus entity extraction and
# keyword-based topic classification settings.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped. In particular, placing content_analysis under this section
# and topic_classification under content_analysis is inferred -- confirm
# against the loader's expected schema.
metadata_enrichment:
  chunk_metadata:
    - "source_document_id"
    - "source_document_type"
    - "chunk_index"
    - "total_chunks"
    - "page_numbers"
    - "section_hierarchy"
    - "table_count"
    - "list_count"
    - "has_calculations"
    - "jurisdiction"
    - "tax_years"
    - "topic_tags"
    - "confidence_score"
    - "pii_free"

  content_analysis:
    extract_entities:
      - "tax_concepts"
      - "form_references"
      - "calculation_methods"
      - "deadlines"
      - "thresholds"
      - "rates"

    topic_classification:
      use_keywords: true
      keyword_lists:
        employment: ["PAYE", "payslip", "P60", "employment", "salary", "wages"]
        # The wrapped value must be indented deeper than its key; at the same
        # column as the key it is a parse error.
        self_employment:
          ["self-employed", "business", "turnover", "expenses", "profit"]
        property: ["rental", "property", "landlord", "FHL", "mortgage interest"]
        dividends: ["dividend", "shares", "distribution", "corporation tax"]
        capital_gains: ["capital gains", "disposal", "acquisition", "CGT"]
# Chunk validation thresholds, exclusion patterns and de-duplication.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped; post_processing is assumed to belong to this section --
# confirm against the loader's expected schema.
quality_control:
  validation_rules:
    min_meaningful_content: 0.7  # Ratio of meaningful words
    max_repetition_ratio: 0.3  # Avoid highly repetitive chunks
    min_sentence_count: 2
    max_sentence_count: 20

  filtering:
    # Double-quoted YAML: "\\s", "\\d", "\\[" yield \s, \d, \[ after parsing.
    exclude_patterns:
      - "^\\s*$"  # Empty chunks
      - "^Page \\d+$"  # Page numbers only
      - "^\\[.*\\]$"  # Placeholder-only chunks
      - "^Table of Contents"
      - "^Index$"

  post_processing:
    deduplicate_chunks: true
    similarity_threshold: 0.95
    merge_similar_chunks: false  # Keep separate for provenance
# Shape of emitted chunks, batch sizing and intermediate storage format.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped -- confirm the hierarchy against the loader's expected schema.
output_format:
  chunk_structure:
    id: "uuid4"
    content: "string"
    metadata: "object"
    embeddings: "optional"  # Added during indexing

  batch_processing:
    batch_size: 100
    parallel_workers: 4
    memory_limit_mb: 1024

  storage:
    intermediate_format: "jsonl"
    compression: "gzip"
    include_source_mapping: true
# Caching, optimization switches and monitoring toggles.
# NOTE(review): nesting was reconstructed from a copy whose indentation had
# been stripped -- confirm the hierarchy against the loader's expected schema.
performance_tuning:
  caching:
    cache_preprocessed: true
    cache_embeddings: false  # Too large
    cache_metadata: true
    ttl_hours: 24

  optimization:
    use_multiprocessing: true
    chunk_size_adaptation: true  # Adjust based on content type
    early_stopping: true  # For very long documents

  monitoring:
    track_processing_time: true
    track_chunk_quality_scores: true
    alert_on_failures: true
    log_statistics: true