Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
78 lines
2.7 KiB
Python
78 lines
2.7 KiB
Python
"""PII detection and de-identification utilities."""
|
|
|
|
import hashlib
|
|
import re
|
|
from typing import Any
|
|
|
|
|
|
class PIIDetector:
    """Detect PII in free text and replace it with stable placeholders.

    Detection is purely regex based (see ``PII_PATTERNS``); every pattern is
    compiled case-insensitively when an instance is created.
    """

    # Regex patterns for common PII, keyed by PII type. Declaration order
    # matters: when two patterns match the exact same span, the one declared
    # first wins during de-identification.
    PII_PATTERNS = {
        "uk_ni_number": r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b",
        "uk_utr": r"\b\d{10}\b",
        "uk_postcode": r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b",
        "uk_sort_code": r"\b\d{2}-\d{2}-\d{2}\b",
        "uk_account_number": r"\b\d{8}\b",
        # TLD class is letters only; a "|" inside [...] is a literal pipe.
        "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        # "+" is not a word character, so a leading \b would demand a word
        # character *before* "+44"; a negative lookbehind expresses the
        # intended "not glued to a preceding word" for both prefixes.
        "phone": r"(?<!\w)(?:\+44|0)\d{10,11}\b",
        "iban": r"\bGB\d{2}[A-Z]{4}\d{14}\b",
        # Either comma-grouped thousands or a plain digit run, so "£1500.50"
        # is captured whole instead of being cut off after three digits.
        "amount": r"£(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d{2})?",
        "date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
    }

    def __init__(self) -> None:
        """Compile every pattern in ``PII_PATTERNS`` with ``re.IGNORECASE``."""
        self.compiled_patterns = {
            name: re.compile(pattern, re.IGNORECASE)
            for name, pattern in self.PII_PATTERNS.items()
        }

    def detect_pii(self, text: str) -> list[dict[str, Any]]:
        """Return every pattern match found in ``text``, sorted by start offset.

        Each entry is a dict with the keys ``type``, ``value``, ``start``,
        ``end`` and ``placeholder``. Matches produced by different patterns
        may overlap; all of them are reported.
        """
        matches = []

        for pii_type, pattern in self.compiled_patterns.items():
            for match in pattern.finditer(text):
                value = match.group()
                matches.append(
                    {
                        "type": pii_type,
                        "value": value,
                        "start": match.start(),
                        "end": match.end(),
                        "placeholder": self._generate_placeholder(
                            pii_type, value
                        ),
                    }
                )

        # sorted() is stable, so equal starts keep pattern declaration order.
        return sorted(matches, key=lambda x: x["start"])

    def de_identify_text(self, text: str) -> tuple[str, dict[str, str]]:
        """Replace PII in ``text`` with placeholders.

        Overlapping matches are resolved first so that each character of the
        input is replaced at most once (see ``_select_non_overlapping``).

        Returns:
            A ``(de_identified_text, mapping)`` tuple where ``mapping`` maps
            each placeholder that was inserted to the original value.
        """
        selected = self._select_non_overlapping(self.detect_pii(text))

        # Stitch the untouched gaps and the placeholders together in one pass.
        pieces = []
        cursor = 0
        for match in selected:
            pieces.append(text[cursor : match["start"]])
            pieces.append(match["placeholder"])
            cursor = match["end"]
        pieces.append(text[cursor:])

        # Filled end-to-start, matching the insertion order of the previous
        # implementation.
        pii_mapping = {
            match["placeholder"]: match["value"] for match in reversed(selected)
        }

        return "".join(pieces), pii_mapping

    @staticmethod
    def _select_non_overlapping(
        matches: list[dict[str, Any]],
    ) -> list[dict[str, Any]]:
        """Pick a left-to-right set of matches whose spans do not overlap.

        Preference order: earliest start, then longest span, then the order
        in which the matches were supplied.
        """
        ordered = sorted(
            matches, key=lambda m: (m["start"], m["start"] - m["end"])
        )
        kept = []
        covered_until = 0
        for match in ordered:
            if match["start"] >= covered_until:
                kept.append(match)
                covered_until = match["end"]
        return kept

    def _generate_placeholder(self, pii_type: str, value: str) -> str:
        """Return ``[<TYPE>_<hash8>]`` for ``value``; equal values give equal output."""
        # MD5 is used only to derive a short stable tag, not for security;
        # saying so keeps this working on FIPS-restricted Python builds.
        # NOTE(review): a truncated, unsalted hash of low-entropy values
        # (e.g. 8-digit numbers) can be brute-forced; consider a keyed HMAC
        # if placeholders ever leave the trust boundary.
        value_hash = hashlib.md5(
            value.encode(), usedforsecurity=False
        ).hexdigest()[:8]
        return f"[{pii_type.upper()}_{value_hash}]"

    def has_pii(self, text: str) -> bool:
        """Return True if any pattern matches somewhere in ``text``."""
        # Stops at the first hit instead of collecting every match.
        return any(
            pattern.search(text) is not None
            for pattern in self.compiled_patterns.values()
        )
|