Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
282 lines
6.4 KiB
YAML
282 lines
6.4 KiB
YAML
# FILE: config/heuristics.yaml

# Document classification hints, keyed by document kind.
# Each kind carries:
#   patterns    - double-quoted text patterns; the sort-code entry uses regex
#                 syntax (`\d{2}`), so treat every entry as a regex.
#   classifiers - plain identifiers of checks associated with the kind.
# NOTE(review): how patterns and classifiers are combined (any/all, weighting)
# is not visible in this file - confirm against the loader.
document_kinds:
  bank_statement:
    patterns:
      - "statement of account"
      - "current account"
      - "savings account"
      - "sort code: \\d{2}-\\d{2}-\\d{2}"
    classifiers:
      - has_sort_code_pattern
      - has_account_number
      - has_transaction_table

  invoice:
    patterns:
      - "invoice"
      - "tax invoice"
      - "vat invoice"
      - "invoice number"
    classifiers:
      - has_vat_number
      - has_invoice_number
      - has_line_items

  receipt:
    patterns:
      - "receipt"
      - "till receipt"
      - "card payment"
    classifiers:
      - has_merchant_name
      - has_payment_method

  payslip:
    patterns:
      - "payslip"
      - "pay advice"
      - "salary statement"
      - "paye"
    classifiers:
      - has_employer_name
      - has_ni_contributions
      - has_tax_code

  p60:
    patterns:
      - "p60"
      - "end of year certificate"
    classifiers:
      - has_tax_year_end
      - has_total_pay
      - has_total_tax
# Normalisation settings for extracted field values: currency detection,
# accepted date formats, employer-name canonicalisation and address parsing.
field_normalization:
  currency:
    # Per-currency match patterns. Entries such as "pounds?" use regex
    # syntax, so symbols that are regex metacharacters must be escaped.
    patterns:
      gbp: ["£", "GBP", "pounds?", "sterling"]
      eur: ["€", "EUR", "euros?"]
      # "$" is escaped: unescaped it is the end-of-input anchor and would
      # match any text. NOTE(review): assumes these entries are compiled as
      # regexes (as "dollars?" implies) - confirm against the loader.
      usd: ["\\$", "USD", "dollars?"]
    # NOTE(review): `default` is placed beside `patterns` (fallback currency
    # code) - confirm this nesting against the consumer's schema.
    default: "GBP"

  # strftime-style formats; day-first variants precede the ISO form.
  date_formats:
    - "%d/%m/%Y"
    - "%d-%m-%Y"
    - "%d %B %Y"
    - "%d %b %Y"
    - "%Y-%m-%d"

  employer_names:
    # canonical key -> list of lowercase aliases.
    canonical_mapping:
      "hmrc":
        - "hm revenue & customs"
        - "her majesty's revenue and customs"
        - "hmrc"
      "nhs": ["national health service", "nhs trust", "nhs foundation trust"]
    # A sequence of single-key mappings, one rule per item.
    # NOTE(review): presumably applied in listed order - confirm.
    normalization_rules:
      - remove_legal_suffixes: ["ltd", "limited", "plc", "llp", "partnership"]
      - standardize_case: "title"
      - remove_extra_whitespace: true

  address_parsing:
    # Anchored, uppercase-only pattern; the same string appears under
    # validation_rules.field_validations.postcode - keep the two in sync.
    postcode_pattern: "^[A-Z]{1,2}\\d[A-Z\\d]?\\s*\\d[A-Z]{2}$"
    components:
      - house_number
      - street_name
      - locality
      - town
      - county
      - postcode
# Maps form boxes to the extracted fields that feed them.
# Each box lists `sources` as "<document_kind>.<field>" strings, optional
# `filters` (expression strings, passed through verbatim) and an
# `aggregation` name.
line_item_mapping:
  sa102_employment:
    # NOTE(review): both boxes sum a payslip field and a p60 field. If the
    # P60 figure is already a year total of the payslips, a plain sum would
    # double count - confirm that dedupe/selection happens upstream.
    box_1_pay_from_employment:
      sources: ["payslip.gross_pay", "p60.total_pay"]
      aggregation: "sum"
    box_2_uk_tax_deducted:
      sources: ["payslip.tax_deducted", "p60.total_tax"]
      aggregation: "sum"

  sa103_self_employment:
    box_12_turnover:
      sources: ["invoice.total", "receipt.amount"]
      filters: ["income_type = 'business'"]
      aggregation: "sum"
    box_31_total_expenses:
      sources: ["receipt.amount", "invoice.amount"]
      filters: ["expense_type = 'business'", "allowable = true"]
      aggregation: "sum"

  sa105_property:
    box_20_property_income:
      # NOTE(review): `rental_statement` has no entry under document_kinds
      # in this file - confirm the kind is defined elsewhere.
      sources: ["bank_statement.credit", "rental_statement.rent"]
      filters: ["description contains 'rent'"]
      aggregation: "sum"
    box_29_property_expenses:
      sources: ["invoice.amount", "receipt.amount"]
      filters:
        - "category in ['repairs', 'maintenance', 'insurance', 'letting_fees']"
      aggregation: "sum"
# Settings for assigning items to a tax period.
period_inference:
  uk_tax_year:
    # Year starts on day 6 of month 4, matching the boundary_logic label.
    start_month: 4
    start_day: 6
    boundary_logic: "6_april_to_5_april"

  basis_period_reform:
    # Quoted so it loads as a string rather than a YAML date.
    effective_from: "2024-04-06"
    transition_rules:
      - "align_to_tax_year"
      - "overlap_relief"

  # income category -> name of the date rule used to place it in a period.
  # NOTE(review): the "_or_" names suggest a fallback between two dates;
  # which one wins is not defined in this file - confirm.
  assignment_rules:
    employment_income: "payment_date"
    self_employment: "invoice_date_or_receipt_date"
    property_income: "due_date_or_receipt_date"
    dividends: "payment_date"
    interest: "credited_date"
# Duplicate-detection rules. Each rule names the `keys` compared, the
# `tolerance` allowed on them, and the `merge_strategy` applied on a match.
dedupe_rules:
  same_transaction:
    keys: ["payer_name_norm", "amount", "date"]
    tolerance:
      amount: 0.01
      date_days: 2
    merge_strategy: "prefer_bank_statement"

  same_invoice:
    keys: ["invoice_number", "supplier_name_norm"]
    # NOTE(review): an amount tolerance is set although "amount" is not in
    # `keys` - confirm whether the consumer still compares amounts here.
    tolerance:
      amount: 0.01
    merge_strategy: "prefer_original_document"
# Inputs to confidence scoring; all numeric values are fractions in [0, 1]
# except `bins`.
confidence_model:
  # Prior per source type, listed from highest to lowest.
  source_priors:
    bank_statement: 0.95
    official_certificate: 0.90
    p60: 0.90
    payslip: 0.85
    invoice: 0.80
    receipt: 0.75
    prior_return: 0.70
    manual_entry: 0.60

  # Descending cut-offs.
  # NOTE(review): whether comparisons are inclusive, and what happens between
  # low_confidence and reject_threshold, is not defined here - confirm.
  ocr_thresholds:
    high_confidence: 0.95
    medium_confidence: 0.85
    low_confidence: 0.70
    reject_threshold: 0.50

  # The four weights sum to 1.0; keep that invariant when editing.
  ensemble_weights:
    ocr_confidence: 0.4
    source_type: 0.3
    field_validation: 0.2
    cross_reference: 0.1

  calibrated_confidence:
    method: "platt_scaling"
    calibration_data: "validation_set_predictions"
    bins: 10
# Rules for resolving disagreeing values between sources.
conflict_resolution:
  # Rank -> source type, per kind of conflict. Rank keys are unquoted and
  # therefore load as integers, not strings.
  # NOTE(review): presumably 1 is the highest precedence - confirm.
  precedence_matrix:
    amount_conflicts:
      1: "bank_statement"
      2: "official_certificate"
      3: "invoice"
      4: "receipt"
      5: "manual_entry"

    date_conflicts:
      1: "bank_statement"
      2: "invoice"
      3: "receipt"
      4: "manual_entry"

    party_name_conflicts:
      1: "official_certificate"
      2: "bank_statement"
      3: "invoice"
      4: "manual_entry"

  # NOTE(review): units of amount_difference_threshold (currency units vs
  # percent) are not stated here - confirm. The value loads as float 10.0.
  escalation_criteria:
    amount_difference_threshold: 10.00
    confidence_gap_threshold: 0.3
    multiple_high_confidence_sources: true
# Validation switches and regexes for extracted identifiers and fields.
# Regexes are double-quoted, so each `\\` is a single backslash once loaded.
validation_rules:
  utr_checksum: true
  ni_number_regex: "^[A-CEGHJ-PR-TW-Z]{2}\\d{6}[A-D]$"
  iban_check: true
  vat_gb_mod97: true
  rounding_policy: "HMRC"  # options: bankers|away_from_zero|HMRC
  numeric_tolerance: 0.01

  # field name -> anchored regex the whole value must match.
  field_validations:
    sort_code: "^\\d{2}-\\d{2}-\\d{2}$"
    account_number: "^\\d{8}$"
    # Same string as field_normalization.address_parsing.postcode_pattern;
    # keep the two in sync.
    postcode: "^[A-Z]{1,2}\\d[A-Z\\d]?\\s*\\d[A-Z]{2}$"
    email: "^[\\w\\.-]+@[\\w\\.-]+\\.[a-zA-Z]{2,}$"
    phone: "^(\\+44|0)[1-9]\\d{8,9}$"
# Settings for matching records that refer to the same party.
entity_resolution:
  blocking_keys:
    - payer_name_norm
    - sort_code_last4
    - postcode
    - vat_number

  # Per-field similarity cut-offs (fractions in [0, 1]).
  # NOTE(review): thresholds exist for phone and email, but
  # matching_algorithms below names no algorithm for them - confirm the
  # consumer's default.
  fuzzy_thresholds:
    name: 0.88
    address: 0.85
    phone: 0.90
    email: 0.95

  # NOTE(review): presumably ordered most-trusted first - confirm.
  canonical_source_priority:
    - bank_statement
    - official_certificate
    - prior_return
    - manual_entry

  matching_algorithms:
    name: "jaro_winkler"
    address: "levenshtein"
    postcode: "exact"
# PII handling: which fields count as PII, how they are masked, and which
# fields are removed or hashed in logs.
privacy_redaction:
  pii_fields:
    - ni_number
    - utr
    - iban
    - sort_code
    - account_number
    - phone
    - email
    - full_address

  # masking rule -> fields it applies to.
  # NOTE(review): `account_number` and `full_address` are listed in
  # pii_fields but appear in no rule here, and `address` below is not a
  # pii_fields entry - confirm whether these are handled elsewhere or are
  # gaps / naming mismatches.
  masking_rules:
    mask_except_last4: ["ni_number", "utr", "iban", "sort_code", "phone"]
    mask_except_domain: ["email"]
    mask_house_number: ["address"]

  log_sanitization:
    remove_fields: ["extracted_text", "ocr_raw_output"]
    hash_fields: ["text_hash", "doc_checksum"]
# Per-jurisdiction, per-year allowance and threshold figures (integers).
# The two entries differ only in dividend_allowance (1000 vs 500).
# NOTE(review): key names suggest UK tax years 2023-24 and 2024-25; verify
# the figures against the published rates before each filing season.
jurisdiction_overrides:
  uk_2023_24:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 1000
    savings_allowance_basic: 1000
    savings_allowance_higher: 500

  uk_2024_25:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 500
    savings_allowance_basic: 1000
    savings_allowance_higher: 500