Files
ai-tax-agent/libs/rag/pii_detector.py
harkon b324ff09ef
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Initial commit
2025-10-11 08:41:36 +01:00

78 lines
2.7 KiB
Python

"""PII detection and de-identification utilities."""
import hashlib
import re
from typing import Any
class PIIDetector:
    """Regex-based PII detection and de-identification utilities.

    Each entry of ``PII_PATTERNS`` is compiled case-insensitively when an
    instance is created. Detected values can be replaced with placeholders
    of the form ``[<TYPE>_<8 hex chars>]``, where the hex part is derived
    from the matched value so the same value always yields the same
    placeholder.
    """

    # Regex patterns for common PII, keyed by PII type. Some patterns are
    # broad (any 10-digit or 8-digit run) and different patterns can match
    # the same or overlapping spans, e.g. "12-34-56" is both a sort code and
    # a date. ``de_identify_text`` resolves such overlaps before replacing.
    PII_PATTERNS: dict[str, str] = {
        "uk_ni_number": r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b",
        "uk_utr": r"\b\d{10}\b",
        "uk_postcode": r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b",
        "uk_sort_code": r"\b\d{2}-\d{2}-\d{2}\b",
        "uk_account_number": r"\b\d{8}\b",
        # The TLD class must not contain "|": inside [...] it is a literal
        # pipe, not alternation.
        "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        # A leading \b can never match in front of "+" at the start of the
        # text or after whitespace, so a negative lookbehind is used
        # instead; for the "0" prefix it behaves exactly like \b.
        "phone": r"(?<!\w)(?:\+44|0)\d{10,11}\b",
        "iban": r"\bGB\d{2}[A-Z]{4}\d{14}\b",
        "amount": r"£\d{1,3}(?:,\d{3})*(?:\.\d{2})?",
        "date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
    }

    def __init__(self) -> None:
        """Compile every pattern in ``PII_PATTERNS`` with ``re.IGNORECASE``."""
        self.compiled_patterns = {
            name: re.compile(pattern, re.IGNORECASE)
            for name, pattern in self.PII_PATTERNS.items()
        }

    def detect_pii(self, text: str) -> list[dict[str, Any]]:
        """Detect PII in text and return matches with positions.

        Args:
            text: The text to scan.

        Returns:
            One dict per regex match with the keys ``type``, ``value``,
            ``start``, ``end`` and ``placeholder``, ordered by ``start``.
            Matches from different patterns may overlap; all of them are
            reported.
        """
        matches = [
            {
                "type": pii_type,
                "value": match.group(),
                "start": match.start(),
                "end": match.end(),
                "placeholder": self._generate_placeholder(
                    pii_type, match.group()
                ),
            }
            for pii_type, pattern in self.compiled_patterns.items()
            for match in pattern.finditer(text)
        ]
        # sorted() is stable, so matches sharing a start offset keep the
        # pattern declaration order.
        return sorted(matches, key=lambda item: item["start"])

    def de_identify_text(self, text: str) -> tuple[str, dict[str, str]]:
        """De-identify text by replacing PII with placeholders.

        Overlapping matches are resolved first (longest match wins), so
        every replaced span is disjoint and placeholders never get spliced
        into each other.

        Args:
            text: The text to de-identify.

        Returns:
            A tuple ``(de_identified_text, mapping)`` where ``mapping`` maps
            each placeholder inserted into the text to the original value
            it replaced.
        """
        chosen = self._select_non_overlapping(self.detect_pii(text))

        pii_mapping: dict[str, str] = {}
        pieces: list[str] = []
        cursor = 0
        # Single left-to-right pass over disjoint spans, reading offsets
        # from the untouched input so no index ever goes stale.
        for match in chosen:
            pieces.append(text[cursor : match["start"]])
            pieces.append(match["placeholder"])
            pii_mapping[match["placeholder"]] = match["value"]
            cursor = match["end"]
        pieces.append(text[cursor:])
        return "".join(pieces), pii_mapping

    @staticmethod
    def _select_non_overlapping(
        matches: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Return a disjoint subset of ``matches`` ordered by start offset.

        Longer matches take priority; ties go to the earlier start and then
        to the order in which the matches were given.
        """
        by_priority = sorted(
            matches,
            key=lambda item: (item["start"] - item["end"], item["start"]),
        )
        kept: list[dict[str, Any]] = []
        for candidate in by_priority:
            if all(
                candidate["end"] <= other["start"]
                or candidate["start"] >= other["end"]
                for other in kept
            ):
                kept.append(candidate)
        kept.sort(key=lambda item: item["start"])
        return kept

    def _generate_placeholder(self, pii_type: str, value: str) -> str:
        """Generate consistent placeholder for PII value.

        Args:
            pii_type: Pattern name; it is upper-cased in the placeholder.
            value: The matched text.

        Returns:
            ``[<PII_TYPE>_<first 8 hex chars of the MD5 of value>]``.
        """
        # MD5 is only used to derive a stable short tag, not for security;
        # saying so keeps this working on FIPS-restricted builds.
        # NOTE(review): a truncated unsalted hash of a low-entropy value
        # (e.g. an 8-digit number) can be brute-forced, so placeholders
        # should not be treated as anonymous on their own.
        value_hash = hashlib.md5(
            value.encode(), usedforsecurity=False
        ).hexdigest()[:8]
        return f"[{pii_type.upper()}_{value_hash}]"

    def has_pii(self, text: str) -> bool:
        """Check if text contains any PII.

        Stops at the first pattern that matches instead of collecting every
        match.
        """
        return any(
            pattern.search(text) is not None
            for pattern in self.compiled_patterns.values()
        )