Initial commit

2025-10-11 08:41:36 +01:00
commit b324ff09ef
276 changed files with 55220 additions and 0 deletions
--- a/pipeline/etl.py
+++ b/pipeline/etl.py
@@ -0,0 +1,420 @@
+# FILE: pipeline/etl.py
+
+import hashlib
+import json
+import logging
+from dataclasses import dataclass
+from datetime import datetime
+from pathlib import Path
+
+import cv2
+import numpy as np
+import pytesseract
+import yaml
+from pdf2image import convert_from_path
+
+from .llm_client import LLMClient
+from .mappers import GraphMapper
+from .normalizers import CurrencyNormalizer, DateNormalizer, PartyNormalizer
+from .validators import DocumentValidator, FieldValidator
+
+
+@dataclass
+class ExtractionResult:
+    """Outcome of one ``DocumentETL.process_document`` run, successful or failed."""
+
+    # Document identifier produced by the pipeline for the input file.
+    doc_id: str
+    # Document type from the classification stage; "unknown" when the pipeline failed.
+    classification: str
+    # Confidence reported by the classification stage; 0.0 when the pipeline failed.
+    confidence: float
+    # Normalized extracted fields; an empty dict when the pipeline failed.
+    extracted_data: dict
+    # Evidence records built from the OCR output; an empty list when the pipeline failed.
+    evidence: list[dict]
+    # Errors from the post-processing checks, or a single "Pipeline failure: ..." entry.
+    errors: list[str]
+    # Elapsed processing time in seconds.
+    processing_time: float
+
+
+class DocumentETL:
+    def __init__(self, config_path: str):
+        with open(config_path) as f:
+            self.config = yaml.safe_load(f)
+
+        self.validator = DocumentValidator(self.config)
+        self.field_validator = FieldValidator(self.config)
+        self.currency_normalizer = CurrencyNormalizer(self.config)
+        self.date_normalizer = DateNormalizer(self.config)
+        self.party_normalizer = PartyNormalizer(self.config)
+        self.graph_mapper = GraphMapper(self.config)
+        self.llm_client = LLMClient(self.config)
+
+        self.logger = logging.getLogger(__name__)
+
+    def process_document(self, file_path: str, taxpayer_id: str) -> ExtractionResult:
+        """Main ETL pipeline entry point"""
+        start_time = datetime.now()
+        doc_id = self._generate_doc_id(file_path)
+
+        try:
+            # Stage 1: Ingest and preprocess
+            images, metadata = self._ingest_document(file_path)
+
+            # Stage 2: Classify document type
+            classification, class_confidence = self._classify_document(
+                images[0], metadata
+            )
+
+            # Stage 3: OCR and layout analysis
+            ocr_results = self._perform_ocr(images)
+
+            # Stage 4: Extract structured data using LLM
+            extracted_data = self._extract_structured_data(
+                ocr_results, classification, doc_id
+            )
+
+            # Stage 5: Validate extracted data
+            validation_errors = self._validate_extraction(
+                extracted_data, classification
+            )
+
+            # Stage 6: Normalize and standardize
+            normalized_data = self._normalize_data(extracted_data)
+
+            # Stage 7: Map to knowledge graph
+            graph_nodes, graph_edges = self._map_to_graph(
+                normalized_data, doc_id, taxpayer_id
+            )
+
+            # Stage 8: Post-processing checks
+            final_errors = self._post_process_checks(
+                graph_nodes, graph_edges, validation_errors
+            )
+
+            processing_time = (datetime.now() - start_time).total_seconds()
+
+            return ExtractionResult(
+                doc_id=doc_id,
+                classification=classification,
+                confidence=class_confidence,
+                extracted_data=normalized_data,
+                evidence=self._create_evidence_records(ocr_results, doc_id),
+                errors=final_errors,
+                processing_time=processing_time,
+            )
+
+        except Exception as e:
+            self.logger.error(f"ETL pipeline failed for {file_path}: {str(e)}")
+            processing_time = (datetime.now() - start_time).total_seconds()
+
+            return ExtractionResult(
+                doc_id=doc_id,
+                classification="unknown",
+                confidence=0.0,
+                extracted_data={},
+                evidence=[],
+                errors=[f"Pipeline failure: {str(e)}"],
+                processing_time=processing_time,
+            )
+
+    def _generate_doc_id(self, file_path: str) -> str:
+        """Generate deterministic document ID"""
+        with open(file_path, "rb") as f:
+            content = f.read()
+        checksum = hashlib.sha256(content).hexdigest()
+        return f"doc_{checksum[:16]}"
+
+    def _ingest_document(self, file_path: str) -> tuple[list[np.ndarray], dict]:
+        """Convert document to images and extract metadata"""
+        file_path = Path(file_path)
+
+        if file_path.suffix.lower() == ".pdf":
+            # Convert PDF to images
+            pil_images = convert_from_path(str(file_path), dpi=300)
+            images = [np.array(img) for img in pil_images]
+        else:
+            # Handle image files
+            img = cv2.imread(str(file_path))
+            if img is None:
+                raise ValueError(f"Could not read image file: {file_path}")
+            images = [img]
+
+        # Preprocess images
+        processed_images = []
+        for img in images:
+            # Deskew and rotate
+            processed_img = self._deskew_image(img)
+            processed_img = self._auto_rotate(processed_img)
+            processed_images.append(processed_img)
+
+        metadata = {
+            "file_path": str(file_path),
+            "file_size": file_path.stat().st_size,
+            "mime_type": self._get_mime_type(file_path),
+            "pages": len(processed_images),
+            "created_at": datetime.now().isoformat(),
+        }
+
+        return processed_images, metadata
+
+    def _classify_document(
+        self, image: np.ndarray, metadata: dict
+    ) -> tuple[str, float]:
+        """Classify document type using OCR + LLM"""
+        # Quick OCR for classification
+        text = pytesseract.image_to_string(image)
+
+        # Use LLM for classification
+        classification_prompt = self._load_prompt("doc_classify")
+        classification_result = self.llm_client.classify_document(
+            text[:2000],
+            classification_prompt,  # First 2000 chars for classification
+        )
+
+        return classification_result["type"], classification_result["confidence"]
+
+    def _perform_ocr(self, images: list[np.ndarray]) -> list[dict]:
+        """Perform OCR with layout analysis"""
+        ocr_results = []
+
+        for page_num, image in enumerate(images, 1):
+            # Get detailed OCR data with bounding boxes
+            ocr_data = pytesseract.image_to_data(
+                image,
+                output_type=pytesseract.Output.DICT,
+                config="--psm 6",  # Uniform block of text
+            )
+
+            # Extract text blocks with confidence and position
+            blocks = []
+            for i in range(len(ocr_data["text"])):
+                if int(ocr_data["conf"][i]) > 30:  # Confidence threshold
+                    blocks.append(
+                        {
+                            "text": ocr_data["text"][i],
+                            "confidence": int(ocr_data["conf"][i]) / 100.0,
+                            "bbox": {
+                                "x": ocr_data["left"][i],
+                                "y": ocr_data["top"][i],
+                                "width": ocr_data["width"][i],
+                                "height": ocr_data["height"][i],
+                            },
+                            "page": page_num,
+                        }
+                    )
+
+            # Detect tables using layout analysis
+            tables = self._detect_tables(image, blocks)
+
+            ocr_results.append(
+                {
+                    "page": page_num,
+                    "blocks": blocks,
+                    "tables": tables,
+                    "full_text": " ".join([b["text"] for b in blocks]),
+                }
+            )
+
+        return ocr_results
+
+    def _extract_structured_data(
+        self, ocr_results: list[dict], classification: str, doc_id: str
+    ) -> dict:
+        """Extract structured data using LLM with schema constraints"""
+
+        # Load appropriate extraction prompt
+        if classification == "bank_statement":
+            prompt = self._load_prompt("bank_statement_extract")
+            schema = self._load_schema("bank_statement")
+        elif classification == "invoice":
+            prompt = self._load_prompt("invoice_extract")
+            schema = self._load_schema("invoice")
+        elif classification == "payslip":
+            prompt = self._load_prompt("payslip_extract")
+            schema = self._load_schema("payslip")
+        else:
+            prompt = self._load_prompt("kv_extract")
+            schema = self._load_schema("generic")
+
+        # Combine OCR results
+        combined_text = "\n".join(
+            [f"Page {r['page']}:\n{r['full_text']}" for r in ocr_results]
+        )
+
+        # Extract with retry logic
+        max_retries = 3
+        for attempt in range(max_retries):
+            try:
+                extracted = self.llm_client.extract_structured_data(
+                    combined_text,
+                    prompt,
+                    schema,
+                    temperature=0.1 if attempt == 0 else 0.3,
+                )
+
+                # Validate against schema
+                if self.field_validator.validate_schema(extracted, schema):
+                    return extracted
+                else:
+                    self.logger.warning(
+                        f"Schema validation failed, attempt {attempt + 1}"
+                    )
+
+            except Exception as e:
+                self.logger.warning(
+                    f"Extraction attempt {attempt + 1} failed: {str(e)}"
+                )
+
+        # Fallback to basic key-value extraction
+        return self._fallback_extraction(ocr_results)
+
+    def _normalize_data(self, extracted_data: dict) -> dict:
+        """Normalize extracted data to canonical formats"""
+        normalized = extracted_data.copy()
+
+        # Normalize currencies
+        for field in ["amount", "gross", "net", "tax_withheld"]:
+            if field in normalized:
+                normalized[field] = self.currency_normalizer.normalize(
+                    normalized[field]
+                )
+
+        # Normalize dates
+        for field in ["date", "period_start", "period_end", "due_date"]:
+            if field in normalized:
+                normalized[field] = self.date_normalizer.normalize(normalized[field])
+
+        # Normalize party names
+        for field in ["payer_name", "employer_name", "supplier_name"]:
+            if field in normalized:
+                normalized[field] = self.party_normalizer.normalize(normalized[field])
+
+        return normalized
+
+    def _map_to_graph(
+        self, normalized_data: dict, doc_id: str, taxpayer_id: str
+    ) -> tuple[list[dict], list[dict]]:
+        """Map normalized data to knowledge graph nodes and edges"""
+        return self.graph_mapper.map_to_graph(normalized_data, doc_id, taxpayer_id)
+
+    def _deskew_image(self, image: np.ndarray) -> np.ndarray:
+        """Correct skew in scanned documents"""
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+        edges = cv2.Canny(gray, 50, 150, apertureSize=3)
+        lines = cv2.HoughLines(edges, 1, np.pi / 180, threshold=100)
+
+        if lines is not None:
+            angles = []
+            for rho, theta in lines[:10]:  # Use first 10 lines
+                angle = theta * 180 / np.pi
+                if angle < 45:
+                    angles.append(angle)
+                elif angle > 135:
+                    angles.append(angle - 180)
+
+            if angles:
+                median_angle = np.median(angles)
+                if abs(median_angle) > 0.5:  # Only rotate if significant skew
+                    (h, w) = image.shape[:2]
+                    center = (w // 2, h // 2)
+                    M = cv2.getRotationMatrix2D(center, median_angle, 1.0)
+                    return cv2.warpAffine(
+                        image,
+                        M,
+                        (w, h),
+                        flags=cv2.INTER_CUBIC,
+                        borderMode=cv2.BORDER_REPLICATE,
+                    )
+
+        return image
+
+    def _auto_rotate(self, image: np.ndarray) -> np.ndarray:
+        """Auto-rotate image to correct orientation"""
+        # Use Tesseract's orientation detection
+        try:
+            osd = pytesseract.image_to_osd(image)
+            rotation = int(
+                [line for line in osd.split("\n") if "Rotate:" in line][0]
+                .split(":")[1]
+                .strip()
+            )
+
+            if rotation != 0:
+                (h, w) = image.shape[:2]
+                center = (w // 2, h // 2)
+                M = cv2.getRotationMatrix2D(center, rotation, 1.0)
+                return cv2.warpAffine(image, M, (w, h))
+        except:
+            pass  # If OSD fails, return original
+
+        return image
+
+    def _detect_tables(self, image: np.ndarray, blocks: list[dict]) -> list[dict]:
+        """Detect and extract table structures"""
+        # Simple table detection using horizontal/vertical lines
+        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+        # Detect horizontal lines
+        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
+        horizontal_lines = cv2.morphologyEx(gray, cv2.MORPH_OPEN, horizontal_kernel)
+
+        # Detect vertical lines
+        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
+        vertical_lines = cv2.morphologyEx(gray, cv2.MORPH_OPEN, vertical_kernel)
+
+        # Find table regions
+        table_mask = cv2.addWeighted(horizontal_lines, 0.5, vertical_lines, 0.5, 0.0)
+        contours, _ = cv2.findContours(
+            table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+        )
+
+        tables = []
+        for contour in contours:
+            x, y, w, h = cv2.boundingRect(contour)
+            if w > 200 and h > 100:  # Minimum table size
+                # Extract text blocks within table region
+                table_blocks = [
+                    block
+                    for block in blocks
+                    if (
+                        block["bbox"]["x"] >= x
+                        and block["bbox"]["y"] >= y
+                        and block["bbox"]["x"] + block["bbox"]["width"] <= x + w
+                        and block["bbox"]["y"] + block["bbox"]["height"] <= y + h
+                    )
+                ]
+
+                tables.append(
+                    {
+                        "bbox": {"x": x, "y": y, "width": w, "height": h},
+                        "blocks": table_blocks,
+                    }
+                )
+
+        return tables
+
+    def _load_prompt(self, prompt_name: str) -> str:
+        """Load LLM prompt template"""
+        prompt_path = Path(f"prompts/{prompt_name}.txt")
+        with open(prompt_path) as f:
+            return f.read()
+
+    def _load_schema(self, schema_name: str) -> dict:
+        """Load JSON schema for validation"""
+        schema_path = Path(f"schemas/{schema_name}.schema.json")
+        with open(schema_path) as f:
+            return json.load(f)
+
+    def _create_evidence_records(
+        self, ocr_results: list[dict], doc_id: str
+    ) -> list[dict]:
+        """Create evidence records with provenance"""
+        evidence = []
+        for page_result in ocr_results:
+            for block in page_result["blocks"]:
+                evidence.append(
+                    {
+                        "snippet_id": f"{doc_id}_p{page_result['page']}_{len(evidence)}",
+                        "doc_ref": doc_id,
+                        "page": page_result["page"],
+                        "bbox": block["bbox"],
+                        "text_hash": hashlib.sha256(block["text"].encode()).hexdigest(),
+                        "ocr_confidence": block["confidence"],
+                        "extracted_text": block["text"],
+                    }
+                )
+        return evidence