Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
This commit is contained in:
77
libs/rag/pii_detector.py
Normal file
77
libs/rag/pii_detector.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""PII detection and de-identification utilities."""
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
|
||||
class PIIDetector:
    """Detect PII in free text and replace it with hashed placeholders.

    Detection is purely regex based: every entry in ``PII_PATTERNS`` is
    compiled case-insensitively and scanned over the input text.
    """

    # Regex patterns for common PII, keyed by the PII type name that is
    # reported in detection results and embedded in placeholders.
    PII_PATTERNS = {
        "uk_ni_number": r"\b[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]\b",
        "uk_utr": r"\b\d{10}\b",
        "uk_postcode": r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b",
        "uk_sort_code": r"\b\d{2}-\d{2}-\d{2}\b",
        "uk_account_number": r"\b\d{8}\b",
        # The TLD class is [A-Za-z]; a "|" inside a character class is a
        # literal pipe, not alternation, and must not be accepted here.
        "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
        # "\b" is applied only to the "0" prefix: there is no word boundary
        # between whitespace (or start of text) and "+", so "\b\+44" would
        # never match a standalone international number.
        "phone": r"(?:\+44|\b0)\d{10,11}\b",
        "iban": r"\bGB\d{2}[A-Z]{4}\d{14}\b",
        "amount": r"£\d{1,3}(?:,\d{3})*(?:\.\d{2})?",
        "date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
    }

    def __init__(self) -> None:
        """Compile every pattern in ``PII_PATTERNS`` with ``re.IGNORECASE``."""
        self.compiled_patterns = {
            name: re.compile(pattern, re.IGNORECASE)
            for name, pattern in self.PII_PATTERNS.items()
        }

    def detect_pii(self, text: str) -> list[dict[str, Any]]:
        """Detect PII in text and return matches with positions.

        Args:
            text: The text to scan.

        Returns:
            One dict per regex match with the keys ``type``, ``value``,
            ``start``, ``end`` and ``placeholder``, ordered by ``start``.
            Matches from different patterns may overlap (for example a
            ``12-34-56`` string is both a sort code and a date); all of them
            are reported.
        """
        matches: list[dict[str, Any]] = []

        for pii_type, pattern in self.compiled_patterns.items():
            for match in pattern.finditer(text):
                value = match.group()
                matches.append(
                    {
                        "type": pii_type,
                        "value": value,
                        "start": match.start(),
                        "end": match.end(),
                        "placeholder": self._generate_placeholder(
                            pii_type, value
                        ),
                    }
                )

        # sorted() is stable, so ties on "start" keep PII_PATTERNS order.
        return sorted(matches, key=lambda m: m["start"])

    def de_identify_text(self, text: str) -> tuple[str, dict[str, str]]:
        """De-identify text by replacing PII with placeholders.

        Overlapping detections are resolved first (earliest start wins, then
        the longest span, then ``PII_PATTERNS`` order) so that each character
        of the input is replaced at most once.

        Args:
            text: The text to de-identify.

        Returns:
            A tuple ``(de_identified_text, mapping)`` where ``mapping`` maps
            each placeholder that was inserted to the original value it
            replaced.
        """
        pii_mapping: dict[str, str] = {}
        pieces: list[str] = []
        cursor = 0

        for match in self._drop_overlaps(self.detect_pii(text)):
            placeholder = match["placeholder"]
            pii_mapping[placeholder] = match["value"]
            # Copy the untouched text before the match, then the placeholder.
            pieces.append(text[cursor : match["start"]])
            pieces.append(placeholder)
            cursor = match["end"]

        pieces.append(text[cursor:])
        return "".join(pieces), pii_mapping

    @staticmethod
    def _drop_overlaps(matches: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Return the subset of ``matches`` whose spans do not overlap."""
        # Earliest start first; on equal starts prefer the longest span.
        # The sort is stable, so remaining ties keep the incoming order.
        ordered = sorted(
            matches, key=lambda m: (m["start"], m["start"] - m["end"])
        )
        kept: list[dict[str, Any]] = []
        last_end = 0
        for match in ordered:
            if match["start"] >= last_end:
                kept.append(match)
                last_end = match["end"]
        return kept

    def _generate_placeholder(self, pii_type: str, value: str) -> str:
        """Generate consistent placeholder for PII value.

        The placeholder has the form ``[<PII_TYPE>_<hash8>]`` where
        ``hash8`` is the first 8 hex characters of the MD5 digest of
        ``value``, so the same type and value always yield the same
        placeholder.
        """
        # NOTE(review): a truncated, unsalted hash of a low-entropy value
        # (e.g. an 8-digit number) can be brute-forced; confirm this is
        # acceptable for the intended use of the placeholders.
        value_hash = hashlib.md5(value.encode()).hexdigest()[:8]
        return f"[{pii_type.upper()}_{value_hash}]"

    def has_pii(self, text: str) -> bool:
        """Check if text contains any PII."""
        # Stop at the first pattern that matches instead of collecting (and
        # hashing) every match just to test for emptiness.
        return any(
            pattern.search(text) for pattern in self.compiled_patterns.values()
        )
|
||||
Reference in New Issue
Block a user