Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
204 lines
4.9 KiB
YAML
204 lines
4.9 KiB
YAML
# FILE: retrieval/chunking.yaml
# Layout-aware chunking, tables, overlap, token targets

# Chunk sizing. `default` holds the baseline values; the entries under
# `by_document_type` are presumably per-type overrides of those values --
# confirm the fallback/merge behaviour against the loader.
# NOTE(review): nesting was reconstructed from key names; verify against the
# schema the consumer expects.
chunking_strategy:
  default:
    chunk_size: 1500  # tokens
    overlap_percentage: 0.12  # 12% overlap
    # Presumably in tokens like chunk_size -- TODO confirm.
    min_chunk_size: 300
    max_chunk_size: 2000

  by_document_type:
    legislation:
      # Equal to default.max_chunk_size above.
      chunk_size: 2000  # Longer chunks for legal text
      overlap_percentage: 0.15
      preserve_sections: true
      section_headers: ["Section", "Subsection", "Paragraph", "Article"]

    best_practices:
      chunk_size: 1200
      overlap_percentage: 0.10
      preserve_lists: true

    glossary:
      chunk_size: 800  # Shorter for definitions
      overlap_percentage: 0.05
      preserve_definitions: true

    firm_knowledge:
      chunk_size: 1500
      overlap_percentage: 0.12
      preserve_procedures: true

# Settings for tables, lists, headings and paragraph boundaries.
# NOTE(review): nesting was reconstructed from key names; verify against the
# schema the consumer expects.
layout_awareness:
  table_handling:
    strategy: "serialize_structured"
    max_table_size: 50  # rows
    column_separator: " | "
    # Double-quoted so that \n is an actual newline character.
    row_separator: "\n"
    preserve_headers: true
    include_table_context: true  # Include surrounding text

  list_handling:
    preserve_structure: true
    bullet_points: ["•", "-", "*", "1.", "a.", "i."]
    nested_indentation: true

  heading_hierarchy:
    preserve_levels: true
    max_heading_level: 6
    include_parent_headings: true  # For context

  paragraph_boundaries:
    respect_boundaries: true
    min_paragraph_length: 50  # characters
    merge_short_paragraphs: true

# Text clean-up, PII placeholder substitution and legal-citation settings.
# NOTE(review): nesting was reconstructed from key names; verify against the
# schema the consumer expects.
text_preprocessing:
  normalization:
    unicode_normalization: "NFKC"
    remove_extra_whitespace: true
    standardize_quotes: true
    fix_encoding_issues: true

  pii_handling:
    de_identify_before_chunking: true
    # Quoted because the value starts with "[" (a YAML flow-sequence
    # indicator).
    placeholder_format: "[{type}_{hash}]"
    pii_types:
      - "UTR"
      - "NI_NUMBER"
      - "IBAN"
      - "SORT_CODE"
      - "PHONE"
      - "EMAIL"
      - "POSTCODE"
      - "NAME"
    hash_algorithm: "sha256"
    # NOTE(review): if the digest is hex-encoded, 8 characters keep only
    # 32 bits -- confirm the collision rate is acceptable.
    hash_truncate: 8  # characters

  legal_text_handling:
    preserve_citations: true
    # Double-quoted, so "\\d" reaches the consumer as the two characters \d.
    citation_patterns:
      - "Section \\d+[A-Z]?"
      - "Regulation \\d+"
      - "Schedule \\d+"
      - "Paragraph \\d+"
    preserve_cross_references: true

# Sentence splitting, do-not-split content kinds and overlap method.
# NOTE(review): nesting was reconstructed from key names; verify against the
# schema the consumer expects.
chunking_rules:
  sentence_boundary_detection:
    use_spacy: true
    model: "en_core_web_sm"
    custom_abbreviations:
      - "Ltd"
      - "PLC"
      - "HMRC"
      - "UTR"
      - "NIC"
      - "PAYE"
      - "VAT"

  semantic_coherence:
    avoid_splitting:
      - "calculation_examples"
      - "step_by_step_procedures"
      - "form_instructions"
      - "definition_blocks"

  overlap_strategy:
    method: "sliding_window"
    # NOTE(review): how chunking_strategy's overlap_percentage values map
    # onto a sentence-based unit is not shown in this file -- confirm.
    overlap_unit: "sentences"  # vs "tokens" or "characters"
    preserve_context: true
    include_metadata_overlap: false

# Metadata field names for chunks plus entity/topic analysis settings.
# NOTE(review): nesting was reconstructed from key names; verify against the
# schema the consumer expects.
metadata_enrichment:
  chunk_metadata:
    - "source_document_id"
    - "source_document_type"
    - "chunk_index"
    - "total_chunks"
    - "page_numbers"
    - "section_hierarchy"
    - "table_count"
    - "list_count"
    - "has_calculations"
    - "jurisdiction"
    - "tax_years"
    - "topic_tags"
    - "confidence_score"
    - "pii_free"

  content_analysis:
    extract_entities:
      - "tax_concepts"
      - "form_references"
      - "calculation_methods"
      - "deadlines"
      - "thresholds"
      - "rates"

    # NOTE(review): placed under content_analysis by inference; it may
    # instead be a sibling of content_analysis -- confirm with the consumer.
    topic_classification:
      use_keywords: true
      keyword_lists:
        employment: ["PAYE", "payslip", "P60", "employment", "salary", "wages"]
        self_employment:
          ["self-employed", "business", "turnover", "expenses", "profit"]
        property: ["rental", "property", "landlord", "FHL", "mortgage interest"]
        dividends: ["dividend", "shares", "distribution", "corporation tax"]
        capital_gains: ["capital gains", "disposal", "acquisition", "CGT"]

# Chunk validation thresholds, exclusion patterns and de-duplication.
# NOTE(review): nesting was reconstructed from key names; verify against the
# schema the consumer expects.
quality_control:
  validation_rules:
    min_meaningful_content: 0.7  # Ratio of meaningful words
    max_repetition_ratio: 0.3  # Avoid highly repetitive chunks
    min_sentence_count: 2
    max_sentence_count: 20

  filtering:
    # Double-quoted, so each "\\" reaches the consumer as a single backslash.
    exclude_patterns:
      - "^\\s*$"  # Empty chunks
      - "^Page \\d+$"  # Page numbers only
      - "^\\[.*\\]$"  # Placeholder-only chunks
      - "^Table of Contents"
      - "^Index$"

  post_processing:
    deduplicate_chunks: true
    similarity_threshold: 0.95
    merge_similar_chunks: false  # Keep separate for provenance

# Chunk record layout, batching limits and intermediate storage settings.
# NOTE(review): nesting was reconstructed from key names; verify against the
# schema the consumer expects.
output_format:
  # Values here are type descriptors written as strings, not literal data.
  chunk_structure:
    id: "uuid4"
    content: "string"
    metadata: "object"
    embeddings: "optional"  # Added during indexing

  batch_processing:
    batch_size: 100
    parallel_workers: 4
    memory_limit_mb: 1024

  storage:
    intermediate_format: "jsonl"
    compression: "gzip"
    include_source_mapping: true

# Caching, optimisation toggles and monitoring flags.
# NOTE(review): nesting was reconstructed from key names (`monitoring` is
# assumed to belong here rather than at top level); verify against the
# schema the consumer expects.
performance_tuning:
  caching:
    cache_preprocessed: true
    cache_embeddings: false  # Too large
    cache_metadata: true
    ttl_hours: 24

  optimization:
    use_multiprocessing: true
    chunk_size_adaptation: true  # Adjust based on content type
    early_stopping: true  # For very long documents

  monitoring:
    track_processing_time: true
    track_chunk_quality_scores: true
    alert_on_failures: true
    log_statistics: true