"""PII detection and de-identification utilities."""

import hashlib
import re
from typing import Any


class PIIDetector:
    """Find PII-like substrings with regexes and replace them by placeholders.

    Every pattern in ``PII_PATTERNS`` is compiled once per instance with
    ``re.IGNORECASE``, so letter classes also match lower-case input.
    Detection is purely lexical: a pattern such as ``uk_utr`` flags *any*
    standalone 10-digit number, so false positives are expected.
    """

    # Regex patterns for common PII.
    # Declaration order matters: when two patterns match the exact same span
    # (e.g. "12-34-56" is both a sort code and a date), de-identification
    # labels the span with the pattern listed first.
    PII_PATTERNS = {
        "uk_ni_number": r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b",
        "uk_utr": r"\b\d{10}\b",
        "uk_postcode": r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b",
        "uk_sort_code": r"\b\d{2}-\d{2}-\d{2}\b",
        "uk_account_number": r"\b\d{8}\b",
        # "|" inside a character class is a literal pipe, not alternation,
        # so the TLD class is spelled without it.
        "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        # No \b in front of "+44": "+" is a non-word character, so a word
        # boundary there would reject numbers at the start of the text or
        # after whitespace. The leading-0 form keeps its boundary.
        "phone": r"(?:\+44|\b0)\d{10,11}\b",
        "iban": r"\bGB\d{2}[A-Z]{4}\d{14}\b",
        # Either comma-grouped thousands or a plain digit run, so that
        # "£1000.50" is matched in full instead of stopping after "£100".
        "amount": r"£(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?",
        "date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
    }

    def __init__(self) -> None:
        """Compile every pattern in ``PII_PATTERNS`` case-insensitively."""
        self.compiled_patterns = {
            name: re.compile(pattern, re.IGNORECASE)
            for name, pattern in self.PII_PATTERNS.items()
        }

    def detect_pii(self, text: str) -> list[dict[str, Any]]:
        """Return every pattern match found in ``text``.

        Each match is a dict with the keys ``type`` (pattern name),
        ``value`` (matched text), ``start`` / ``end`` (offsets into
        ``text``) and ``placeholder``. Matches are ordered by ``start``;
        matches sharing a start keep pattern declaration order. Matches of
        different patterns may overlap - they are all reported as-is.
        """
        found = [
            {
                "type": pii_type,
                "value": hit.group(),
                "start": hit.start(),
                "end": hit.end(),
                "placeholder": self._generate_placeholder(pii_type, hit.group()),
            }
            for pii_type, pattern in self.compiled_patterns.items()
            for hit in pattern.finditer(text)
        ]
        # sorted() is stable, which preserves declaration order on ties.
        return sorted(found, key=lambda item: item["start"])

    def de_identify_text(self, text: str) -> tuple[str, dict[str, str]]:
        """Replace every detected PII span in ``text`` with a placeholder.

        Overlapping matches are merged into a single span first. Splicing
        them in one after another would apply stale offsets to an already
        modified string and leave fragments of placeholders (or of the
        original PII) behind.

        Returns:
            A ``(de_identified_text, mapping)`` tuple where ``mapping`` maps
            each placeholder present in the output to the original text it
            replaced, so the substitution can be reversed.
        """
        pii_mapping: dict[str, str] = {}
        pieces: list[str] = []
        cursor = 0
        for start, end, pii_type in self._merge_overlapping_spans(
            self.detect_pii(text)
        ):
            original = text[start:end]
            placeholder = self._generate_placeholder(pii_type, original)
            pii_mapping[placeholder] = original
            pieces.append(text[cursor:start])
            pieces.append(placeholder)
            cursor = end
        pieces.append(text[cursor:])
        return "".join(pieces), pii_mapping

    @staticmethod
    def _merge_overlapping_spans(
        matches: list[dict[str, Any]],
    ) -> list[tuple[int, int, str]]:
        """Collapse overlapping matches into disjoint ``(start, end, type)`` spans.

        A merged span covers the union of its members and takes the type of
        the member that starts first (the longest one on equal starts, then
        pattern declaration order). Spans that merely touch are kept apart.
        """
        ordered = sorted(matches, key=lambda item: (item["start"], -item["end"]))
        spans: list[tuple[int, int, str]] = []
        for item in ordered:
            if spans and item["start"] < spans[-1][1]:
                start, end, pii_type = spans[-1]
                spans[-1] = (start, max(end, item["end"]), pii_type)
            else:
                spans.append((item["start"], item["end"], item["type"]))
        return spans

    def _generate_placeholder(self, pii_type: str, value: str) -> str:
        """Build the placeholder ``[<TYPE>_<hash>]`` for a PII value.

        The hash is the first 8 hex digits of the MD5 of ``value``, so the
        same value always yields the same placeholder.

        NOTE(review): the hash is unsalted and truncated; it gives stable
        labels, not cryptographic protection - low-entropy values (e.g. an
        8-digit account number) could be recovered by enumeration.
        """
        value_hash = hashlib.md5(value.encode()).hexdigest()[:8]
        return f"[{pii_type.upper()}_{value_hash}]"

    def has_pii(self, text: str) -> bool:
        """Return True if any pattern matches somewhere in ``text``."""
        # Stops at the first hit instead of collecting and sorting every match.
        return any(
            pattern.search(text) for pattern in self.compiled_patterns.values()
        )