Files
ai-tax-agent/config/heuristics.yaml
harkon b324ff09ef
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Initial commit
2025-10-11 08:41:36 +01:00

282 lines
6.4 KiB
YAML

# FILE: config/heuristics.yaml
# Text cues for recognising each kind of document. Every kind carries a list
# of `patterns` (plain phrases, one of them a regex) and a list of named
# `classifiers`. How the two are combined is decided by the consumer, which is
# not visible in this file.
document_kinds:
  bank_statement:
    patterns:
      - "statement of account"
      - "current account"
      - "savings account"
      # Literal label followed by a sort code written as NN-NN-NN.
      - "sort code: \\d{2}-\\d{2}-\\d{2}"
    classifiers:
      - has_sort_code_pattern
      - has_account_number
      - has_transaction_table

  invoice:
    patterns:
      - "invoice"
      - "tax invoice"
      - "vat invoice"
      - "invoice number"
    classifiers:
      - has_vat_number
      - has_invoice_number
      - has_line_items

  receipt:
    patterns:
      - "receipt"
      - "till receipt"
      - "card payment"
    classifiers:
      - has_merchant_name
      - has_payment_method

  payslip:
    patterns:
      - "payslip"
      - "pay advice"
      - "salary statement"
      - "paye"
    classifiers:
      - has_employer_name
      - has_ni_contributions
      - has_tax_code

  p60:
    patterns:
      - "p60"
      - "end of year certificate"
    classifiers:
      - has_tax_year_end
      - has_total_pay
      - has_total_tax
# Rules for normalising extracted field values: currency, dates, employer
# names and postal addresses.
field_normalization:
  currency:
    # Entries such as "pounds?" use regex syntax (optional plural), so every
    # entry has to be a valid, correctly escaped regex fragment.
    # NOTE(review): confirm the consumer compiles these as regular expressions.
    patterns:
      gbp: ["£", "GBP", "pounds?", "sterling"]
      eur: ["€", "EUR", "euros?"]
      # The dollar sign is escaped: a bare "$" is the end-of-input anchor in a
      # regex and would match any text, tagging every amount as USD.
      usd: ["\\$", "USD", "dollars?"]
    # Currency code used when none of the patterns above applies.
    default: "GBP"

  # strptime-style date layouts; day-first forms are listed before ISO.
  date_formats:
    - "%d/%m/%Y"
    - "%d-%m-%Y"
    - "%d %B %Y"
    - "%d %b %Y"
    - "%Y-%m-%d"

  employer_names:
    # Canonical short name -> spellings that should collapse onto it.
    canonical_mapping:
      "hmrc":
        ["hm revenue & customs", "her majesty's revenue and customs", "hmrc"]
      "nhs": ["national health service", "nhs trust", "nhs foundation trust"]
    # Each list item is a single-key mapping naming one rule and its argument.
    normalization_rules:
      - remove_legal_suffixes: ["ltd", "limited", "plc", "llp", "partnership"]
      - standardize_case: "title"
      - remove_extra_whitespace: true

  address_parsing:
    # Upper-case UK postcode with an optional gap before the inward part; the
    # same expression is used under validation_rules.field_validations.
    postcode_pattern: "^[A-Z]{1,2}\\d[A-Z\\d]?\\s*\\d[A-Z]{2}$"
    components:
      - house_number
      - street_name
      - locality
      - town
      - county
      - postcode
# Maps return boxes (grouped by supplementary page) to the extracted document
# fields that feed them. Each box names its `sources`, optional `filters`, and
# an `aggregation`; every box in this file aggregates with "sum".
line_item_mapping:
  sa102_employment:
    box_1_pay_from_employment:
      # NOTE(review): a P60 total normally already covers the year's payslips;
      # confirm that summing both sources cannot double count.
      sources:
        - "payslip.gross_pay"
        - "p60.total_pay"
      aggregation: "sum"
    box_2_uk_tax_deducted:
      sources:
        - "payslip.tax_deducted"
        - "p60.total_tax"
      aggregation: "sum"

  sa103_self_employment:
    box_12_turnover:
      sources:
        - "invoice.total"
        - "receipt.amount"
      filters:
        - "income_type = 'business'"
      aggregation: "sum"
    box_31_total_expenses:
      sources:
        - "receipt.amount"
        - "invoice.amount"
      filters:
        - "expense_type = 'business'"
        - "allowable = true"
      aggregation: "sum"

  sa105_property:
    box_20_property_income:
      sources:
        - "bank_statement.credit"
        - "rental_statement.rent"
      filters:
        - "description contains 'rent'"
      aggregation: "sum"
    box_29_property_expenses:
      sources:
        - "invoice.amount"
        - "receipt.amount"
      filters:
        - "category in ['repairs', 'maintenance', 'insurance', 'letting_fees']"
      aggregation: "sum"
# Settings for placing income in the right tax period.
period_inference:
  # The tax year starts on 6 April (month 4, day 6) and ends on 5 April.
  uk_tax_year:
    start_month: 4
    start_day: 6
    boundary_logic: "6_april_to_5_april"

  basis_period_reform:
    # Quoted so the value stays a string instead of being parsed as a date.
    effective_from: "2024-04-06"
    transition_rules:
      - "align_to_tax_year"
      - "overlap_relief"

  # Income type -> name of the date rule used to assign it to a period.
  assignment_rules:
    employment_income: "payment_date"
    self_employment: "invoice_date_or_receipt_date"
    property_income: "due_date_or_receipt_date"
    dividends: "payment_date"
    interest: "credited_date"
# Duplicate-detection rules. Each rule lists the `keys` compared, the
# `tolerance` allowed on them, and the `merge_strategy` applied to a match.
dedupe_rules:
  same_transaction:
    keys:
      - "payer_name_norm"
      - "amount"
      - "date"
    tolerance:
      amount: 0.01
      date_days: 2
    merge_strategy: "prefer_bank_statement"

  same_invoice:
    keys:
      - "invoice_number"
      - "supplier_name_norm"
    # NOTE(review): an amount tolerance is set although `amount` is not one of
    # this rule's keys; confirm how the consumer uses it.
    tolerance:
      amount: 0.01
    merge_strategy: "prefer_original_document"
# Inputs to the confidence score attached to extracted values.
confidence_model:
  # Prior score per source type, listed from highest (0.95) to lowest (0.60).
  source_priors:
    bank_statement: 0.95
    official_certificate: 0.90
    p60: 0.90
    payslip: 0.85
    invoice: 0.80
    receipt: 0.75
    prior_return: 0.70
    manual_entry: 0.60

  # OCR confidence bands in descending order, ending with the reject cut-off.
  ocr_thresholds:
    high_confidence: 0.95
    medium_confidence: 0.85
    low_confidence: 0.70
    reject_threshold: 0.50

  # The four weights add up to 1.0.
  ensemble_weights:
    ocr_confidence: 0.4
    source_type: 0.3
    field_validation: 0.2
    cross_reference: 0.1

  calibrated_confidence:
    method: "platt_scaling"
    calibration_data: "validation_set_predictions"
    bins: 10
# How disagreements between sources are settled, and when they are escalated.
conflict_resolution:
  # One ranked table per conflict type, keyed by integer rank.
  # NOTE(review): rank 1 is presumably the most trusted source; confirm with
  # the consumer.
  precedence_matrix:
    amount_conflicts:
      1: "bank_statement"
      2: "official_certificate"
      3: "invoice"
      4: "receipt"
      5: "manual_entry"
    date_conflicts:
      1: "bank_statement"
      2: "invoice"
      3: "receipt"
      4: "manual_entry"
    party_name_conflicts:
      1: "official_certificate"
      2: "bank_statement"
      3: "invoice"
      4: "manual_entry"

  escalation_criteria:
    amount_difference_threshold: 10.00
    confidence_gap_threshold: 0.3
    multiple_high_confidence_sources: true
# Switches and regular expressions used to validate extracted identifiers.
validation_rules:
  utr_checksum: true
  # National Insurance number: two prefix letters, six digits, suffix A-D.
  # Neither prefix letter may be D, F, I, Q, U or V, and the second letter
  # additionally may not be O, hence the narrower second character class.
  # NOTE(review): the unallocated prefixes (BG, GB, KN, NK, NT, TN, ZZ) are
  # not rejected by this expression.
  ni_number_regex: "^[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]\\d{6}[A-D]$"
  iban_check: true
  vat_gb_mod97: true
  rounding_policy: "HMRC"  # options: bankers|away_from_zero|HMRC
  numeric_tolerance: 0.01
  # Anchored regexes; each must match the whole field value.
  field_validations:
    sort_code: "^\\d{2}-\\d{2}-\\d{2}$"
    account_number: "^\\d{8}$"
    # Same expression as field_normalization.address_parsing.postcode_pattern.
    postcode: "^[A-Z]{1,2}\\d[A-Z\\d]?\\s*\\d[A-Z]{2}$"
    email: "^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$"
    phone: "^(\\+44|0)[1-9]\\d{8,9}$"
# Settings for deciding whether two records describe the same party.
entity_resolution:
  blocking_keys:
    - payer_name_norm
    - sort_code_last4
    - postcode
    - vat_number

  # Similarity threshold per attribute.
  fuzzy_thresholds:
    name: 0.88
    address: 0.85
    phone: 0.90
    email: 0.95

  # Source types in the order they are listed for supplying canonical values.
  canonical_source_priority:
    - bank_statement
    - official_certificate
    - prior_return
    - manual_entry

  # Comparison algorithm per attribute; postcodes are matched exactly.
  matching_algorithms:
    name: "jaro_winkler"
    address: "levenshtein"
    postcode: "exact"
# Which fields are personal data, how each is masked, and what is stripped
# from or hashed in logs.
privacy_redaction:
  pii_fields:
    - ni_number
    - utr
    - iban
    - sort_code
    - account_number
    - phone
    - email
    - full_address

  # Every field declared in pii_fields is covered by one rule below.
  # Previously account_number and full_address had no masking rule.
  masking_rules:
    mask_except_last4:
      - "ni_number"
      - "utr"
      - "iban"
      - "sort_code"
      - "account_number"
      - "phone"
    mask_except_domain:
      - "email"
    # "address" is kept for existing consumers; "full_address" is the name
    # declared in pii_fields.
    # NOTE(review): confirm which of the two names the extractor emits.
    mask_house_number:
      - "address"
      - "full_address"

  log_sanitization:
    remove_fields:
      - "extracted_text"
      - "ocr_raw_output"
    hash_fields:
      - "text_hash"
      - "doc_checksum"
# Allowances and thresholds per tax year. The two years listed differ only in
# dividend_allowance (1000 vs 500).
# NOTE(review): 125140 matches the level at which the UK additional rate
# starts; confirm the consumer reads `higher_rate_threshold` that way.
jurisdiction_overrides:
  uk_2023_24:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 1000
    savings_allowance_basic: 1000
    savings_allowance_higher: 500

  uk_2024_25:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 500
    savings_allowance_basic: 1000
    savings_allowance_higher: 500