"""Core coverage evaluation engine.""" from datetime import datetime from typing import Any import structlog from ..schemas import ( BlockingItem, Citation, CompiledCoveragePolicy, CoverageItem, CoverageReport, FoundEvidence, OverallStatus, Role, ScheduleCoverage, Status, ) logger = structlog.get_logger() class CoverageEvaluator: """Core coverage evaluation engine""" def __init__(self, kg_client: Any = None, rag_client: Any = None): self.kg_client = kg_client self.rag_client = rag_client async def check_document_coverage( self, taxpayer_id: str, tax_year: str, policy: CompiledCoveragePolicy, ) -> CoverageReport: """Main coverage evaluation workflow""" logger.info( "Starting coverage evaluation", taxpayer_id=taxpayer_id, tax_year=tax_year, policy_version=policy.policy.version, ) # Step A: Infer required schedules required_schedules = await self.infer_required_schedules( taxpayer_id, tax_year, policy ) # Step B: Evaluate each schedule schedule_coverage = [] all_blocking_items = [] for schedule_id in required_schedules: coverage = await self._evaluate_schedule_coverage( schedule_id, taxpayer_id, tax_year, policy ) schedule_coverage.append(coverage) # Collect blocking items for evidence in coverage.evidence: if evidence.role == Role.REQUIRED and evidence.status == Status.MISSING: all_blocking_items.append( BlockingItem(schedule_id=schedule_id, evidence_id=evidence.id) ) # Step C: Determine overall status overall_status = self._determine_overall_status( schedule_coverage, all_blocking_items ) return CoverageReport( tax_year=tax_year, taxpayer_id=taxpayer_id, schedules_required=required_schedules, overall_status=overall_status, coverage=schedule_coverage, blocking_items=all_blocking_items, policy_version=policy.policy.version, ) async def infer_required_schedules( self, taxpayer_id: str, tax_year: str, policy: CompiledCoveragePolicy, ) -> list[str]: """Determine which schedules are required for this taxpayer""" required = [] for schedule_id, trigger in policy.policy.triggers.items(): is_required = False # Check any_of conditions if trigger.any_of: for condition in trigger.any_of: predicate = policy.compiled_predicates.get(condition) if predicate and predicate(taxpayer_id, tax_year): is_required = True break # Check all_of conditions if trigger.all_of and not is_required: all_match = True for condition in trigger.all_of: predicate = policy.compiled_predicates.get(condition) if not predicate or not predicate(taxpayer_id, tax_year): all_match = False break if all_match: is_required = True if is_required: required.append(schedule_id) logger.debug( "Schedule required", schedule_id=schedule_id, taxpayer_id=taxpayer_id, ) return required async def find_evidence_docs( self, taxpayer_id: str, tax_year: str, evidence_ids: list[str], policy: CompiledCoveragePolicy, ) -> dict[str, list[FoundEvidence]]: """Find evidence documents in the knowledge graph""" if not self.kg_client: logger.warning("No KG client available, returning empty evidence") empty_evidence_list: list[FoundEvidence] = [] return dict.fromkeys(evidence_ids, empty_evidence_list) # Import here to avoid circular imports from ..neo import kg_find_evidence evidence_map: dict[str, list[FoundEvidence]] = {} thresholds = policy.policy.defaults.confidence_thresholds for evidence_id in evidence_ids: try: found = await kg_find_evidence( self.kg_client, taxpayer_id=taxpayer_id, tax_year=tax_year, kinds=[evidence_id], min_ocr=thresholds.get("ocr", 0.6), date_window=policy.policy.defaults.date_tolerance_days, ) evidence_map[evidence_id] = found except Exception as e: logger.error( "Failed to find evidence", evidence_id=evidence_id, error=str(e), ) empty_list: list[FoundEvidence] = [] evidence_map[evidence_id] = empty_list return evidence_map def classify_status( self, found: list[FoundEvidence], policy: CompiledCoveragePolicy, tax_year: str, ) -> Status: """Classify evidence status based on what was found""" if not found: return Status.MISSING classifier = policy.policy.status_classifier tax_year_start, tax_year_end = self._parse_tax_year_bounds( policy.policy.tax_year_boundary.start, policy.policy.tax_year_boundary.end, ) # Check for conflicts first if len(found) > 1: # Simple conflict detection: different totals for same period # In production, this would be more sophisticated return Status.CONFLICTING evidence = found[0] # Check if evidence meets verified criteria if ( evidence.ocr_confidence >= classifier.present_verified.min_ocr and evidence.extract_confidence >= classifier.present_verified.min_extract ): # Check date validity if evidence.date: # Handle both date-only and datetime strings consistently if "T" not in evidence.date: # Date-only string, add time and timezone (middle of day) evidence_date = datetime.fromisoformat( evidence.date + "T12:00:00+00:00" ) else: # Full datetime string, ensure timezone-aware evidence_date = datetime.fromisoformat( evidence.date.replace("Z", "+00:00") ) if tax_year_start <= evidence_date <= tax_year_end: return Status.PRESENT_VERIFIED # Check if evidence meets unverified criteria if ( evidence.ocr_confidence >= classifier.present_unverified.min_ocr and evidence.extract_confidence >= classifier.present_unverified.min_extract ): return Status.PRESENT_UNVERIFIED # Default to missing if confidence too low return Status.MISSING async def build_reason_and_citations( self, schedule_id: str, evidence_item: Any, status: Status, taxpayer_id: str, tax_year: str, policy: CompiledCoveragePolicy, ) -> tuple[str, list[Citation]]: """Build human-readable reason and citations""" # Build reason text reason = self._build_reason_text(evidence_item, status, policy) # Get citations from KG citations = [] if self.kg_client: try: from ..neo import kg_rule_citations kg_citations = await kg_rule_citations( self.kg_client, schedule_id, evidence_item.boxes ) citations.extend(kg_citations) except Exception as e: logger.warning("Failed to get KG citations", error=str(e)) # Fallback to RAG citations if needed if not citations and self.rag_client: try: from ..rag import rag_search_for_citations query = f"{schedule_id} {evidence_item.id} requirements" filters = { "jurisdiction": policy.policy.jurisdiction, "tax_year": tax_year, "pii_free": True, } rag_citations = await rag_search_for_citations( self.rag_client, query, filters ) citations.extend(rag_citations) except Exception as e: logger.warning("Failed to get RAG citations", error=str(e)) return reason, citations async def _evaluate_schedule_coverage( self, schedule_id: str, taxpayer_id: str, tax_year: str, policy: CompiledCoveragePolicy, ) -> ScheduleCoverage: """Evaluate coverage for a single schedule""" schedule_policy = policy.policy.schedules[schedule_id] evidence_items = [] # Get all evidence IDs for this schedule evidence_ids = [e.id for e in schedule_policy.evidence] # Find evidence in KG evidence_map = await self.find_evidence_docs( taxpayer_id, tax_year, evidence_ids, policy ) # Evaluate each evidence requirement for evidence_req in schedule_policy.evidence: # Check if conditionally required evidence applies if ( evidence_req.role == Role.CONDITIONALLY_REQUIRED and evidence_req.condition ): predicate = policy.compiled_predicates.get(evidence_req.condition) if not predicate or not predicate(taxpayer_id, tax_year): continue # Skip this evidence as condition not met found = evidence_map.get(evidence_req.id, []) status = self.classify_status(found, policy, tax_year) reason, citations = await self.build_reason_and_citations( schedule_id, evidence_req, status, taxpayer_id, tax_year, policy ) evidence_item = CoverageItem( id=evidence_req.id, role=evidence_req.role, status=status, boxes=evidence_req.boxes, found=found, acceptable_alternatives=evidence_req.acceptable_alternatives, reason=reason, citations=citations, ) evidence_items.append(evidence_item) # Determine schedule status schedule_status = self._determine_schedule_status(evidence_items) return ScheduleCoverage( schedule_id=schedule_id, status=schedule_status, evidence=evidence_items, ) def _determine_overall_status( self, schedule_coverage: list[ScheduleCoverage], blocking_items: list[BlockingItem], ) -> OverallStatus: """Determine overall coverage status""" if blocking_items: return OverallStatus.BLOCKING # Check if all schedules are OK all_ok = all(s.status == OverallStatus.OK for s in schedule_coverage) if all_ok: return OverallStatus.OK return OverallStatus.PARTIAL def _determine_schedule_status( self, evidence_items: list[CoverageItem] ) -> OverallStatus: """Determine status for a single schedule""" # Check for blocking issues has_missing_required = any( e.role == Role.REQUIRED and e.status == Status.MISSING for e in evidence_items ) if has_missing_required: return OverallStatus.BLOCKING # Check for partial issues has_unverified = any( e.status == Status.PRESENT_UNVERIFIED for e in evidence_items ) if has_unverified: return OverallStatus.PARTIAL return OverallStatus.OK def _build_reason_text( self, evidence_item: Any, status: Status, policy: CompiledCoveragePolicy, ) -> str: """Build human-readable reason text""" evidence_id = evidence_item.id # Get reason from policy if available if evidence_item.reasons and "short" in evidence_item.reasons: base_reason = evidence_item.reasons["short"] else: base_reason = f"{evidence_id} is required for this schedule." # Add status-specific details if status == Status.MISSING: return f"No {evidence_id} found. {base_reason}" elif status == Status.PRESENT_UNVERIFIED: return ( f"{evidence_id} present but confidence below threshold. {base_reason}" ) elif status == Status.CONFLICTING: return f"Conflicting {evidence_id} documents found. {base_reason}" else: return f"{evidence_id} verified. {base_reason}" def _parse_tax_year_bounds( self, start_str: str, end_str: str ) -> tuple[datetime, datetime]: """Parse tax year boundary strings to datetime objects""" # Handle both date-only and datetime strings if "T" not in start_str: # Date-only string, add time and timezone start = datetime.fromisoformat(start_str + "T00:00:00+00:00") else: # Full datetime string, ensure timezone-aware start = datetime.fromisoformat(start_str.replace("Z", "+00:00")) if "T" not in end_str: # Date-only string, add time and timezone (end of day) end = datetime.fromisoformat(end_str + "T23:59:59+00:00") else: # Full datetime string, ensure timezone-aware end = datetime.fromisoformat(end_str.replace("Z", "+00:00")) return start, end