# FILE: config/heuristics.yaml
---
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file — confirm it against the consumer's schema.

# Per-document-kind text patterns and the names of the classifiers to apply.
# Pattern strings can carry regex syntax (see the sort-code entry).
document_kinds:
  bank_statement:
    patterns:
      - "statement of account"
      - "current account"
      - "savings account"
      - "sort code: \\d{2}-\\d{2}-\\d{2}"
    classifiers:
      - has_sort_code_pattern
      - has_account_number
      - has_transaction_table
  invoice:
    patterns:
      - "invoice"
      - "tax invoice"
      - "vat invoice"
      - "invoice number"
    classifiers:
      - has_vat_number
      - has_invoice_number
      - has_line_items
  receipt:
    patterns:
      - "receipt"
      - "till receipt"
      - "card payment"
    classifiers:
      - has_merchant_name
      - has_payment_method
  payslip:
    patterns:
      - "payslip"
      - "pay advice"
      - "salary statement"
      - "paye"
    classifiers:
      - has_employer_name
      - has_ni_contributions
      - has_tax_code
  p60:
    patterns:
      - "p60"
      - "end of year certificate"
    classifiers:
      - has_tax_year_end
      - has_total_pay
      - has_total_tax

# Rules for normalising extracted field values.
field_normalization:
  currency:
    # Entries such as "pounds?" use a regex quantifier, so these are treated
    # as regular expressions. The dollar sign is escaped because a bare `$`
    # is the end-of-input anchor and would match any text.
    patterns:
      gbp: ["£", "GBP", "pounds?", "sterling"]
      eur: ["€", "EUR", "euros?"]
      usd: ["\\$", "USD", "dollars?"]
    default: "GBP"
  # strftime-style format strings; day-first forms are listed before ISO.
  date_formats:
    - "%d/%m/%Y"
    - "%d-%m-%Y"
    - "%d %B %Y"
    - "%d %b %Y"
    - "%Y-%m-%d"
  employer_names:
    # Canonical key -> list of spellings that map to it.
    canonical_mapping:
      "hmrc":
        - "hm revenue & customs"
        - "her majesty's revenue and customs"
        - "hmrc"
      "nhs":
        - "national health service"
        - "nhs trust"
        - "nhs foundation trust"
    # A list of single-key rules; presumably applied in order — TODO confirm.
    normalization_rules:
      - remove_legal_suffixes: ["ltd", "limited", "plc", "llp", "partnership"]
      - standardize_case: "title"
      - remove_extra_whitespace: true
  address_parsing:
    # Anchored, upper-case only; whitespace before the final part is optional.
    postcode_pattern: "^[A-Z]{1,2}\\d[A-Z\\d]?\\s*\\d[A-Z]{2}$"
    components:
      - house_number
      - street_name
      - locality
      - town
      - county
      - postcode

# Maps form boxes to the extracted fields that feed them. Section names appear
# to follow the HMRC Self Assessment pages SA102/SA103/SA105 — TODO confirm.
line_item_mapping:
  sa102_employment:
    # NOTE(review): summing payslip figures together with the P60 year-end
    # totals may double-count when both cover the same employment — confirm
    # how the consumer applies "sum" across sources.
    box_1_pay_from_employment:
      sources: ["payslip.gross_pay", "p60.total_pay"]
      aggregation: "sum"
    box_2_uk_tax_deducted:
      sources: ["payslip.tax_deducted", "p60.total_tax"]
      aggregation: "sum"
  sa103_self_employment:
    # NOTE(review): this box reads `invoice.total` while the other boxes read
    # `invoice.amount` — confirm both fields exist and the choice is intended.
    box_12_turnover:
      sources: ["invoice.total", "receipt.amount"]
      filters: ["income_type = 'business'"]
      aggregation: "sum"
    box_31_total_expenses:
      sources: ["receipt.amount", "invoice.amount"]
      filters: ["expense_type = 'business'", "allowable = true"]
      aggregation: "sum"
  sa105_property:
    box_20_property_income:
      sources: ["bank_statement.credit", "rental_statement.rent"]
      filters: ["description contains 'rent'"]
      aggregation: "sum"
    box_29_property_expenses:
      sources: ["invoice.amount", "receipt.amount"]
      filters:
        - "category in ['repairs', 'maintenance', 'insurance', 'letting_fees']"
      aggregation: "sum"

# How a transaction is assigned to a tax period.
period_inference:
  # Tax year starts on day 6 of month 4 (6 April) and runs to 5 April.
  uk_tax_year:
    start_month: 4
    start_day: 6
    boundary_logic: "6_april_to_5_april"
  basis_period_reform:
    # Quoted so the value stays a string rather than a parsed date.
    effective_from: "2024-04-06"
    transition_rules:
      - "align_to_tax_year"
      - "overlap_relief"
  # Income category -> name of the date field that decides the period.
  assignment_rules:
    employment_income: "payment_date"
    self_employment: "invoice_date_or_receipt_date"
    property_income: "due_date_or_receipt_date"
    dividends: "payment_date"
    interest: "credited_date"

# Duplicate detection: match keys, allowed differences, and which copy wins.
dedupe_rules:
  same_transaction:
    keys: ["payer_name_norm", "amount", "date"]
    tolerance:
      amount: 0.01
      date_days: 2
    merge_strategy: "prefer_bank_statement"
  same_invoice:
    keys: ["invoice_number", "supplier_name_norm"]
    tolerance:
      amount: 0.01
    merge_strategy: "prefer_original_document"

# Confidence scoring inputs.
confidence_model:
  # Prior per source type, on a 0-1 scale.
  source_priors:
    bank_statement: 0.95
    official_certificate: 0.90
    p60: 0.90
    payslip: 0.85
    invoice: 0.80
    receipt: 0.75
    prior_return: 0.70
    manual_entry: 0.60
  ocr_thresholds:
    high_confidence: 0.95
    medium_confidence: 0.85
    low_confidence: 0.70
    reject_threshold: 0.50
  # The four weights add up to 1.0.
  ensemble_weights:
    ocr_confidence: 0.4
    source_type: 0.3
    field_validation: 0.2
    cross_reference: 0.1
  calibrated_confidence:
    method: "platt_scaling"
    calibration_data: "validation_set_predictions"
    bins: 10

# Which source wins when documents disagree, and when to escalate instead.
conflict_resolution:
  # Integer keys are ranks; 1 is presumably the highest precedence — TODO
  # confirm against the consumer.
  precedence_matrix:
    amount_conflicts:
      1: "bank_statement"
      2: "official_certificate"
      3: "invoice"
      4: "receipt"
      5: "manual_entry"
    date_conflicts:
      1: "bank_statement"
      2: "invoice"
      3: "receipt"
      4: "manual_entry"
    party_name_conflicts:
      1: "official_certificate"
      2: "bank_statement"
      3: "invoice"
      4: "manual_entry"
  escalation_criteria:
    amount_difference_threshold: 10.00
    confidence_gap_threshold: 0.3
    multiple_high_confidence_sources: true
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file — confirm it against the consumer's schema.

# Switches and patterns used to validate extracted identifiers and fields.
validation_rules:
  utr_checksum: true
  # Prefix letters: D, F, I, Q, U and V are excluded in both positions, and O
  # is additionally excluded as the second letter, so the two positions need
  # separate character classes. Unallocated prefixes (e.g. BG, GB, ZZ) are not
  # rejected here.
  ni_number_regex: "^[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\\d{6}[A-D]$"
  iban_check: true
  vat_gb_mod97: true
  rounding_policy: "HMRC"  # options: bankers|away_from_zero|HMRC
  numeric_tolerance: 0.01
  # Anchored regexes, one per field name.
  field_validations:
    sort_code: "^\\d{2}-\\d{2}-\\d{2}$"
    account_number: "^\\d{8}$"
    postcode: "^[A-Z]{1,2}\\d[A-Z\\d]?\\s*\\d[A-Z]{2}$"
    email: "^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$"
    phone: "^(\\+44|0)[1-9]\\d{8,9}$"

# Settings for matching records that refer to the same party.
entity_resolution:
  blocking_keys:
    - payer_name_norm
    - sort_code_last4
    - postcode
    - vat_number
  # Minimum similarity per field, on a 0-1 scale.
  fuzzy_thresholds:
    name: 0.88
    address: 0.85
    phone: 0.90
    email: 0.95
  canonical_source_priority:
    - bank_statement
    - official_certificate
    - prior_return
    - manual_entry
  matching_algorithms:
    name: "jaro_winkler"
    address: "levenshtein"
    postcode: "exact"

# Fields treated as personal data and how they are masked or kept out of logs.
privacy_redaction:
  pii_fields:
    - ni_number
    - utr
    - iban
    - sort_code
    - account_number
    - phone
    - email
    - full_address
  # NOTE(review): `account_number` is listed as PII above but has no masking
  # rule here — confirm what the consumer does with PII fields that have no
  # rule.
  # NOTE(review): `mask_house_number` targets "address" while the PII list
  # names `full_address` — confirm which field name the consumer expects.
  masking_rules:
    mask_except_last4: ["ni_number", "utr", "iban", "sort_code", "phone"]
    mask_except_domain: ["email"]
    mask_house_number: ["address"]
  log_sanitization:
    remove_fields: ["extracted_text", "ocr_raw_output"]
    hash_fields: ["text_hash", "doc_checksum"]

# Per-tax-year allowances and thresholds (amounts presumably in GBP — TODO
# confirm units).
jurisdiction_overrides:
  uk_2023_24:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 1000
    savings_allowance_basic: 1000
    savings_allowance_higher: 500
  uk_2024_25:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 500
    savings_allowance_basic: 1000
    savings_allowance_higher: 500