Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

This commit is contained in:
harkon
2025-10-11 08:41:36 +01:00
commit b324ff09ef
276 changed files with 55220 additions and 0 deletions

405
config/coverage.yaml Normal file
View File

@@ -0,0 +1,405 @@
# FILE: config/coverage.yaml
# Identity of this coverage policy: schema version, jurisdiction and the tax year it applies to.
version: '1.0'
jurisdiction: 'UK'
tax_year: '2024-25'
# First and last day of the tax year. Quoted so loaders keep them as strings, not dates.
tax_year_boundary:
  start: '2024-04-06'
  end: '2025-04-05'
# Baseline thresholds and switches applied when a schedule does not override them.
# NOTE(review): the three scalar settings are assumed to sit directly under `defaults`
# (not under `confidence_thresholds`) — confirm.
defaults:
  confidence_thresholds:
    ocr: 0.82
    extract: 0.85
  date_tolerance_days: 30
  require_lineage_bbox: true
  # when primary statement missing, allow verified bank YTD + reconciliation
  allow_bank_substantiation: true
# Canonical kinds used by extractor/classifier (map your classifier labels to these).
# NOTE(review): `schedules` also references `EmployerStatement` and `EEA_FHL_OccupancyLog`,
# neither of which is listed here — confirm whether they should be added or the references renamed.
document_kinds:
  - P60
  - P45
  - P11D
  - PayslipMonthly
  - FinalPayslipYTD
  - EmploymentContract
  - AccountsPAndL
  - AccountsBalanceSheet
  - CapitalAllowancesSchedule
  - MileageLog
  - LettingAgentStatements
  - TenancyLedger
  - MortgageInterestCertificate
  - OwnershipShareProof
  - OccupancyLog
  - BookingsCalendar
  - BankStatements
  - BuildingSocietyInterestCert
  - BankInterestAnnualStatement
  - DividendVouchers
  - ConsolidatedTaxVoucher
  - SLCAnnualStatement
  - PensionContributionStatement
  - GiftAidStatement
  - ForeignIncomeStatement
  - OverseasTaxCreditStatement
  - TrustDistributionStatement
  - EstateR185
  - CGT_BrokerAnnualReport
  - CGT_Computation
  - RemittanceBasisWorkpaper
  - ResidenceEvidence
  - HMRC_CodingNotice
  - HMRC_PaymentOnAccount
  - OtherSupportingDoc
# Handy lookup keys used by AskClarifyingQuestion; keep them high-level & stable.
# Each key maps to the guidance document id and its kind.
guidance_refs:
  # NOTE(review): doc_id form number (SA150) differs from the key (SA100) — presumably intentional; confirm.
  SA100_Notes_2025:
    doc_id: 'SA150-Notes-2025'
    kind: 'Notes'
  SA102_Notes_2025:
    doc_id: 'SA102-Notes-2025'
    kind: 'Notes'
  SA103S_Notes_2025:
    doc_id: 'SA103S-Notes-2025'
    kind: 'Notes'
  SA103F_Notes_2025:
    doc_id: 'SA103F-Notes-2025'
    kind: 'Notes'
  SA105_Notes_2025:
    doc_id: 'SA105-Notes-2025'
    kind: 'Notes'
  SA106_Notes_2025:
    doc_id: 'SA106-Notes-2025'
    kind: 'Notes'
  SA107_Notes_2025:
    doc_id: 'SA107-Notes-2025'
    kind: 'Notes'
  SA108_Notes_2025:
    doc_id: 'SA108-Notes-2025'
    kind: 'Notes'
  SA109_Notes_2025:
    doc_id: 'SA109-Notes-2025'
    kind: 'Notes'
  SA110_Notes_2025:
    doc_id: 'SA110-Notes-2025'
    kind: 'Notes'
# Evaluate against KG & intake flags to decide which schedules apply.
# Each schedule lists its conditions under `any_of`; selector expressions are quoted
# because they contain YAML indicator characters ([ ] " ,).
triggers:
  SA102:
    any_of:
      - exists: 'IncomeItem[type="Employment"]'
      - taxpayer_flag: has_employment
  SA103S:
    any_of:
      - exists: 'IncomeItem[type="SelfEmployment" AND turnover_lt_vat_threshold=true]'
      - taxpayer_flag: is_self_employed_short
  SA103F:
    any_of:
      - exists: 'IncomeItem[type="SelfEmployment" AND turnover_ge_vat_threshold=true]'
      - taxpayer_flag: is_self_employed_full
  SA105:
    any_of:
      - exists: 'IncomeItem[type="UKPropertyRent"]'
      - taxpayer_flag: has_property_income
  SA106:
    any_of:
      - exists: 'IncomeItem[type IN ["ForeignInterest","ForeignDividends","ForeignEmployment","EEA_FHL","OverseasProperty"]]'
      - taxpayer_flag: has_foreign_income
  SA107:
    any_of:
      - exists: TrustDistribution
      - exists: EstateIncome
      - taxpayer_flag: has_trust_or_estate_income
  SA108:
    any_of:
      - exists: CapitalGain
      - taxpayer_flag: has_disposals
  SA109:
    any_of:
      - taxpayer_flag: claims_remittance_basis
      - exists: NonUKResident
  SA110:
    any_of:
      - filing_mode: paper
      - taxpayer_flag: wants_manual_calculation
# Per-schedule evidence requirements. Each evidence entry names a document kind (`id`),
# how strongly it is needed (`role`), the return boxes it supports (`boxes`) and, optionally,
# acceptable alternatives, validity constraints, a trigger `condition` and a short reason.
# NOTE(review): nesting was reconstructed from a whitespace-stripped view — confirm against the repo copy.
schedules:
  SA102: # Employment
    guidance_hint: SA102_Notes_2025
    evidence:
      - id: P60
        role: REQUIRED
        boxes: ["SA102_b1", "SA102_b2"] # pay and UK tax taken off
        acceptable_alternatives: ["P45", "FinalPayslipYTD"]
        validity:
          within_tax_year: true
        reasons:
          # Box range restored: the separator between 1 and 2 had been lost ("boxes 12").
          short: "P60 (or P45/final payslip) provides year-to-date pay and PAYE tax figures for boxes 1-2."
      - id: P11D
        role: CONDITIONALLY_REQUIRED
        condition: exists(BenefitInKind=true)
        boxes:
          [
            "SA102_b9",
            "SA102_b10",
            "SA102_b11",
            "SA102_b12",
            "SA102_b13",
            "SA102_b14",
            "SA102_b15",
            "SA102_b16",
            "SA102_b17",
            "SA102_b18",
            "SA102_b19",
            "SA102_b20",
          ]
        # NOTE(review): EmployerStatement is not listed in document_kinds — confirm.
        acceptable_alternatives: ["EmployerStatement"]
        validity:
          available_by: "2025-07-06"
        reasons:
          # Box range restored to match the boxes list above (b9..b20); it previously read "boxes 920".
          short: "P11D carries benefits/expenses that map to boxes 9-20 when not payrolled."
      - id: SLCAnnualStatement
        role: OPTIONAL
        boxes: ["SA102_b21", "SA102_b21_1"]
        reasons:
          short: "Student/Postgrad loan indicators and plan types where applicable."
      - id: PayslipMonthly
        role: OPTIONAL
        boxes: ["SA102_b3"] # tips/other payments not on P60
        acceptable_alternatives: []
      - id: EmploymentContract
        role: OPTIONAL
        boxes: []
        reasons:
          short: "Used only for disambiguation (OFF-PAYROLL/IR35, director)."
    cross_checks:
      - name: "PAYE Reconcile"
        logic: "Sum(payrolled_BIKs_excluded_from_SLR) handled; P60 box totals = SA102_b1; PAYE tax = SA102_b2 within ±£1."
  SA103S: # Self-employment (short)
    guidance_hint: SA103S_Notes_2025
    evidence:
      - id: AccountsPAndL
        role: REQUIRED
        boxes: ["SA103S_b9", "SA103S_b15", "SA103S_b28"]
        reasons:
          short: "Turnover and allowable expenses supporting net profit figures."
      - id: BankStatements
        role: REQUIRED
        boxes: ["SA103S_b9", "SA103S_b11", "SA103S_b17"]
        reasons:
          short: "Bank corroboration of takings/expenses (cash basis or traditional)."
      - id: CapitalAllowancesSchedule
        role: CONDITIONALLY_REQUIRED
        condition: exists(ExpenseItem[category='CapitalAllowances'])
        boxes: ["SA103S_b49"]
      - id: MileageLog
        role: OPTIONAL
        boxes: ["SA103S_b20"]
      - id: HMRC_CodingNotice
        role: OPTIONAL
        boxes: []
        reasons:
          short: "Basis period changes or coding interactions."
    # Chooses between the short and full self-employment pages.
    selection_rule:
      prefer_short_if: "turnover < VAT_threshold AND no_complex_adjustments"
      else_use: "SA103F"
  SA103F: # Self-employment (full)
    guidance_hint: SA103F_Notes_2025
    evidence:
      - id: AccountsPAndL
        role: REQUIRED
        boxes: ["SA103F_b15", "SA103F_b31", "SA103F_b73"]
      - id: AccountsBalanceSheet
        role: REQUIRED
        boxes: []
      - id: BankStatements
        role: REQUIRED
        boxes: ["SA103F_b15", "SA103F_b31"]
      - id: CapitalAllowancesSchedule
        role: CONDITIONALLY_REQUIRED
        condition: exists(ExpenseItem[category='CapitalAllowances'])
        boxes: ["SA103F_b50", "SA103F_b52", "SA103F_b55", "SA103F_b57"]
      - id: MileageLog
        role: OPTIONAL
        boxes: ["SA103F_b20"]
    notes:
      long_form_needed_if:
        - "turnover >= VAT_threshold"
        - "claims overlap adjustments, averaging, or multiple trades"
  SA105: # UK Property (incl. UK FHL)
    guidance_hint: SA105_Notes_2025
    evidence:
      - id: LettingAgentStatements
        role: REQUIRED
        boxes: ["SA105_b5", "SA105_b20", "SA105_b29"] # income and totals; totals vs. sum of expenses
        acceptable_alternatives: ["TenancyLedger", "BankStatements"]
        reasons:
          short: "Gross rents, fees and charges per-year by property/portfolio."
      - id: MortgageInterestCertificate
        role: CONDITIONALLY_REQUIRED
        condition: exists(ExpenseItem[category='FinanceCosts'])
        boxes: ["SA105_b44"] # feeds SA110 basic-rate credit
      - id: OwnershipShareProof
        role: CONDITIONALLY_REQUIRED
        condition: property_joint_ownership=true
        boxes: ["SA105_b3"]
      - id: OccupancyLog
        role: CONDITIONALLY_REQUIRED
        condition: candidate_FHL=true
        boxes: ["SA105_b5", "SA105_b20"]
        acceptable_alternatives: ["BookingsCalendar"]
      - id: BankStatements
        role: OPTIONAL
        boxes: ["SA105_b20", "SA105_b29"]
    cross_checks:
      - name: "Property Income Allowance Gate"
        # Box ranges restored; they previously read "2429" and "612".
        logic: "If SA105_b20.1 claimed then no expense boxes 24-29 or FHL expense boxes 6-12 allowed."
  SA106: # Foreign
    guidance_hint: SA106_Notes_2025
    evidence:
      - id: ForeignIncomeStatement
        role: REQUIRED
        boxes: ["SA106_b1", "SA106_b2", "SA106_b3", "SA106_b5"]
        reasons:
          short: "Dividends/interest/overseas employment; gross and tax paid."
      - id: OverseasTaxCreditStatement
        role: CONDITIONALLY_REQUIRED
        condition: claims_FTCR=true
        boxes: ["SA106_b2", "SA106_b5"]
      # NOTE(review): EEA_FHL_OccupancyLog is not listed in document_kinds — confirm.
      - id: EEA_FHL_OccupancyLog
        role: CONDITIONALLY_REQUIRED
        condition: exists(IncomeItem[type='EEA_FHL'])
        boxes: ["SA106_b14", "SA106_b15"]
      - id: BankStatements
        role: OPTIONAL
        boxes: ["SA106_b1", "SA106_b3"]
    notes:
      remittance_interaction: "If remittance basis claimed, mirror to SA109."
  SA107: # Trusts etc
    guidance_hint: SA107_Notes_2025
    evidence:
      - id: TrustDistributionStatement
        role: REQUIRED
        boxes: ["SA107_b1", "SA107_b2", "SA107_b3"]
      - id: EstateR185
        role: CONDITIONALLY_REQUIRED
        condition: received_estate_income=true
        boxes: ["SA107_b9", "SA107_b10"]
      - id: BankStatements
        role: OPTIONAL
        boxes: []
  SA108: # Capital Gains
    guidance_hint: SA108_Notes_2025
    evidence:
      - id: CGT_BrokerAnnualReport
        role: REQUIRED
        boxes:
          [
            "SA108_b4",
            "SA108_b5",
            "SA108_b6",
            "SA108_b9",
            "SA108_b11",
            "SA108_b14",
          ]
        reasons:
          short: "Disposals, proceeds, allowable costs, gain breakdowns (residential vs other)."
      - id: CGT_Computation
        role: REQUIRED
        boxes: ["SA108_b28", "SA108_b34"]
      - id: BankStatements
        role: OPTIONAL
        boxes: ["SA108_b4", "SA108_b5"]
    special_2024_25:
      adjustment_note: "Rate change adjustment for disposals on/after 2024-10-30 may be required."
  SA109: # Residence / Remittance
    guidance_hint: SA109_Notes_2025
    evidence:
      - id: ResidenceEvidence
        role: REQUIRED
        boxes: ["SA109_b1", "SA109_b7", "SA109_b8", "SA109_b9"]
      - id: RemittanceBasisWorkpaper
        role: CONDITIONALLY_REQUIRED
        condition: claims_remittance_basis=true
        boxes: ["SA109_b28", "SA109_b39"]
      - id: ForeignIncomeStatement
        role: OPTIONAL
        boxes: ["SA109_b28", "SA109_b39"]
  SA110: # Tax calculation summary (paper/manual)
    guidance_hint: SA110_Notes_2025
    evidence:
      - id: HMRC_PaymentOnAccount
        role: OPTIONAL
        boxes: ["SA110_b10", "SA110_b11"]
      - id: HMRC_CodingNotice
        role: OPTIONAL
        boxes: ["SA110_b7", "SA110_b8", "SA110_b9"]
    notes:
      online_filing: "If online, SA110 is computed automatically; still store calculation lineage for audit."
  SA100: # Core return - savings/dividends/gift aid, etc.
    guidance_hint: SA100_Notes_2025
    evidence:
      - id: BankInterestAnnualStatement
        role: CONDITIONALLY_REQUIRED
        condition: exists(IncomeItem[type='SavingsInterest'])
        boxes: ["SA100_b1"]
      - id: DividendVouchers
        role: CONDITIONALLY_REQUIRED
        condition: exists(IncomeItem[type='Dividends'])
        boxes: ["SA100_b2"]
        acceptable_alternatives: ["ConsolidatedTaxVoucher"]
      - id: PensionContributionStatement
        role: CONDITIONALLY_REQUIRED
        condition: exists(PensionContribution[relief_method='RAS'])
        boxes: ["SA100_b4"]
      - id: GiftAidStatement
        role: OPTIONAL
        boxes: ["SA100_b5"]
# How we classify found evidence for coverage.
# Four statuses are defined: present_verified, present_unverified, conflicting, missing.
status_classifier:
  present_verified:
    min_ocr: 0.82
    min_extract: 0.85
    date_in_year: true
  present_unverified:
    min_ocr: 0.60
    min_extract: 0.70
    date_in_year_or_tolerance: true
  conflicting:
    conflict_rules:
      - 'Same doc kind, different totals for same period ±£1'
      - 'Totals disagree with KG aggregates by >£1'
  missing:
    default: true
# Source precedence used when evidence conflicts, plus the review-escalation settings.
conflict_resolution:
  # Listed in precedence order as given in the original list.
  precedence:
    - 'LettingAgentStatements'
    - 'P60'
    - 'P11D'
    - 'ConsolidatedTaxVoucher'
    - 'BankStatements'
    - 'ManualEntry'
  escalation:
    to_review: true
    reason_templates:
      - 'Document totals disagree with computed aggregates.'
      - 'Low confidence OCR; request re-upload or alternative.'
# Templates for clarifying questions. `{...}` placeholders are filled in by the consumer.
# NOTE(review): `reasons` is assumed to be a sibling of `default` (per-document text for `{why}`) — confirm.
question_templates:
  default:
    # Apostrophe restored in "don't" (it previously read "dont").
    text: "To complete the {schedule} for {tax_year}, we need {evidence}. These documents support boxes {boxes}. If you don't have this, you can provide {alternatives}."
    why: "{why}. See guidance: {guidance_doc}."
  reasons:
    P60: "P60 provides your year-end pay and PAYE tax figures for the employment page."
    # Box range restored (it previously read "boxes 920").
    P11D: "P11D lists benefits and expenses that map directly to boxes 9-20 when not payrolled."
    LettingAgentStatements: "HMRC expects evidence of gross rents and expenses to support SA105 totals."
    MortgageInterestCertificate: "Mortgage interest supports the basic-rate tax reduction computation."
    # Possessive apostrophe restored (it previously read "Brokers annual").
    CGT_BrokerAnnualReport: "Broker's annual summaries and computations substantiate proceeds, costs and gains."
# Privacy controls for the vector index.
privacy:
  # Ensure we never index PII into vectors
  vector_pii_free: true
  # Named redaction patterns; the pattern definitions themselves are not in this file.
  redact_patterns: [NI_Number, UTR, IBAN, SortCode, AccountNumber, Email, Phone]

281
config/heuristics.yaml Normal file
View File

@@ -0,0 +1,281 @@
# FILE: config/heuristics.yaml
# Per-kind classification hints: text `patterns` to look for and named `classifiers` to run.
# Single quotes are used so backslashes in regex fragments are literal.
document_kinds:
  bank_statement:
    patterns:
      - 'statement of account'
      - 'current account'
      - 'savings account'
      - 'sort code: \d{2}-\d{2}-\d{2}'
    classifiers: [has_sort_code_pattern, has_account_number, has_transaction_table]
  invoice:
    patterns:
      - 'invoice'
      - 'tax invoice'
      - 'vat invoice'
      - 'invoice number'
    classifiers: [has_vat_number, has_invoice_number, has_line_items]
  receipt:
    patterns:
      - 'receipt'
      - 'till receipt'
      - 'card payment'
    classifiers: [has_merchant_name, has_payment_method]
  payslip:
    patterns:
      - 'payslip'
      - 'pay advice'
      - 'salary statement'
      - 'paye'
    classifiers: [has_employer_name, has_ni_contributions, has_tax_code]
  p60:
    patterns:
      - 'p60'
      - 'end of year certificate'
    classifiers: [has_tax_year_end, has_total_pay, has_total_tax]
# Normalisation rules for extracted fields: currency detection, accepted date formats,
# employer-name canonicalisation and UK address parsing.
field_normalization:
  currency:
    # Entries are regex fragments (note the optional-plural `?`), so regex metacharacters
    # must be escaped. A bare "$" is the end-of-input anchor and would match any text,
    # so the dollar sign is written as "\\$" (YAML double-quoted escape for the regex \$).
    # NOTE(review): assumes the consumer compiles these as regular expressions — confirm.
    patterns:
      gbp: ["£", "GBP", "pounds?", "sterling"]
      eur: ["€", "EUR", "euros?"]
      usd: ["\\$", "USD", "dollars?"]
    default: "GBP"
  # strftime-style formats; the order shown is preserved from the original list.
  date_formats:
    - "%d/%m/%Y"
    - "%d-%m-%Y"
    - "%d %B %Y"
    - "%d %b %Y"
    - "%Y-%m-%d"
  employer_names:
    # canonical name -> known aliases
    canonical_mapping:
      "hmrc":
        ["hm revenue & customs", "her majesty's revenue and customs", "hmrc"]
      "nhs": ["national health service", "nhs trust", "nhs foundation trust"]
    normalization_rules:
      - remove_legal_suffixes: ["ltd", "limited", "plc", "llp", "partnership"]
      - standardize_case: "title"
      - remove_extra_whitespace: true
  address_parsing:
    postcode_pattern: "^[A-Z]{1,2}\\d[A-Z\\d]?\\s*\\d[A-Z]{2}$"
    components:
      - house_number
      - street_name
      - locality
      - town
      - county
      - postcode
# Maps return boxes to the extracted fields that feed them, with optional filters and an aggregation.
line_item_mapping:
  sa102_employment:
    # NOTE(review): summing payslip figures together with P60 totals looks like it could
    # double count the same pay/tax — confirm how `sum` treats overlapping sources.
    box_1_pay_from_employment:
      sources:
        - 'payslip.gross_pay'
        - 'p60.total_pay'
      aggregation: 'sum'
    box_2_uk_tax_deducted:
      sources:
        - 'payslip.tax_deducted'
        - 'p60.total_tax'
      aggregation: 'sum'
  sa103_self_employment:
    # NOTE(review): coverage.yaml ties turnover evidence to SA103S_b9 / SA103F_b15, not box 12 — confirm numbering.
    box_12_turnover:
      sources:
        - 'invoice.total'
        - 'receipt.amount'
      filters:
        - "income_type = 'business'"
      aggregation: 'sum'
    box_31_total_expenses:
      sources:
        - 'receipt.amount'
        - 'invoice.amount'
      filters:
        - "expense_type = 'business'"
        - 'allowable = true'
      aggregation: 'sum'
  sa105_property:
    box_20_property_income:
      sources:
        - 'bank_statement.credit'
        - 'rental_statement.rent'
      filters:
        - "description contains 'rent'"
      aggregation: 'sum'
    box_29_property_expenses:
      sources:
        - 'invoice.amount'
        - 'receipt.amount'
      filters:
        - "category in ['repairs', 'maintenance', 'insurance', 'letting_fees']"
      aggregation: 'sum'
# Rules for placing items into a tax period.
period_inference:
  # Tax year start: month 4, day 6.
  uk_tax_year:
    start_month: 4
    start_day: 6
    boundary_logic: '6_april_to_5_april'
  basis_period_reform:
    effective_from: '2024-04-06'
    transition_rules: ['align_to_tax_year', 'overlap_relief']
  # Which date field decides the period for each income category.
  assignment_rules:
    employment_income: 'payment_date'
    self_employment: 'invoice_date_or_receipt_date'
    property_income: 'due_date_or_receipt_date'
    dividends: 'payment_date'
    interest: 'credited_date'
# Duplicate detection: matching keys, per-field tolerances and which record wins on merge.
dedupe_rules:
  same_transaction:
    keys:
      - 'payer_name_norm'
      - 'amount'
      - 'date'
    tolerance:
      amount: 0.01
      date_days: 2
    merge_strategy: 'prefer_bank_statement'
  same_invoice:
    keys:
      - 'invoice_number'
      - 'supplier_name_norm'
    tolerance:
      amount: 0.01
    merge_strategy: 'prefer_original_document'
# Inputs to the confidence score: per-source priors, OCR bands, ensemble weights and calibration.
confidence_model:
  source_priors:
    bank_statement: 0.95
    official_certificate: 0.90
    p60: 0.90
    payslip: 0.85
    invoice: 0.80
    receipt: 0.75
    prior_return: 0.70
    manual_entry: 0.60
  ocr_thresholds:
    high_confidence: 0.95
    medium_confidence: 0.85
    low_confidence: 0.70
    reject_threshold: 0.50
  # The four weights below add up to 1.0.
  ensemble_weights:
    ocr_confidence: 0.4
    source_type: 0.3
    field_validation: 0.2
    cross_reference: 0.1
  calibrated_confidence:
    method: 'platt_scaling'
    calibration_data: 'validation_set_predictions'
    bins: 10
# Which source wins when values disagree, by conflict type, and when to escalate instead.
conflict_resolution:
  # Integer keys are ranks; 1 is listed first for each conflict type.
  precedence_matrix:
    amount_conflicts:
      1: 'bank_statement'
      2: 'official_certificate'
      3: 'invoice'
      4: 'receipt'
      5: 'manual_entry'
    date_conflicts:
      1: 'bank_statement'
      2: 'invoice'
      3: 'receipt'
      4: 'manual_entry'
    party_name_conflicts:
      1: 'official_certificate'
      2: 'bank_statement'
      3: 'invoice'
      4: 'manual_entry'
  escalation_criteria:
    amount_difference_threshold: 10.00
    confidence_gap_threshold: 0.3
    multiple_high_confidence_sources: true
# Identifier checks, numeric policy and per-field regexes.
# Regexes are single-quoted so each backslash is literal (no YAML escape processing).
validation_rules:
  utr_checksum: true
  ni_number_regex: '^[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]$'
  iban_check: true
  vat_gb_mod97: true
  # options: bankers|away_from_zero|HMRC
  rounding_policy: 'HMRC'
  numeric_tolerance: 0.01
  field_validations:
    sort_code: '^\d{2}-\d{2}-\d{2}$'
    account_number: '^\d{8}$'
    postcode: '^[A-Z]{1,2}\d[A-Z\d]?\s*\d[A-Z]{2}$'
    email: '^[\w\.-]+@[\w\.-]+\.[a-zA-Z]{2,}$'
    phone: '^(\+44|0)[1-9]\d{8,9}$'
# Entity matching: blocking keys, fuzzy-match thresholds, source priority and per-field algorithms.
entity_resolution:
  blocking_keys: [payer_name_norm, sort_code_last4, postcode, vat_number]
  fuzzy_thresholds:
    name: 0.88
    address: 0.85
    phone: 0.90
    email: 0.95
  canonical_source_priority: [bank_statement, official_certificate, prior_return, manual_entry]
  matching_algorithms:
    name: 'jaro_winkler'
    address: 'levenshtein'
    postcode: 'exact'
# PII handling: which fields count as PII, how each is masked, and what is stripped or hashed in logs.
privacy_redaction:
  pii_fields:
    - ni_number
    - utr
    - iban
    - sort_code
    - account_number
    - phone
    - email
    - full_address
  masking_rules:
    # NOTE(review): account_number is declared as PII above but has no masking rule — confirm intended handling.
    mask_except_last4: ["ni_number", "utr", "iban", "sort_code", "phone"]
    mask_except_domain: ["email"]
    # The declared PII field is `full_address`; the rule previously named only `address`,
    # so it is listed alongside the existing name to keep current behaviour.
    mask_house_number: ["address", "full_address"]
  log_sanitization:
    remove_fields: ["extracted_text", "ocr_raw_output"]
    hash_fields: ["text_hash", "doc_checksum"]
# Per-tax-year allowance and threshold figures.
# Between the two years shown, only dividend_allowance differs (1000 -> 500).
jurisdiction_overrides:
  uk_2023_24:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 1000
    savings_allowance_basic: 1000
    savings_allowance_higher: 500

  uk_2024_25:
    personal_allowance: 12570
    basic_rate_threshold: 37700
    higher_rate_threshold: 125140
    dividend_allowance: 500
    savings_allowance_basic: 1000
    savings_allowance_higher: 500