# FILE: apps/svc-rpa/main.py
# mypy: disable-error-code=union-attr
# Playwright automation for portal data extraction (HMRC, banks, etc.)

import asyncio
import os

# Import shared libraries
import sys
from datetime import datetime, timezone
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from playwright.async_api import Browser, Page, Playwright, async_playwright

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_vault_client
from libs.events import EventBus, EventPayload
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id

logger = structlog.get_logger()


class RPASettings(BaseAppSettings):
    """Settings for RPA service"""

    service_name: str = "svc-rpa"

    # Browser configuration
    browser_type: str = "chromium"  # chromium, firefox, webkit
    headless: bool = True
    # Passed to Playwright as ``timeout=``, i.e. milliseconds.
    timeout: int = 30000  # 30 seconds

    # Portal configurations
    hmrc_base_url: str = "https://www.gov.uk/log-in-hmrc-online-services"
    open_banking_enabled: bool = False

    # Security
    max_concurrent_sessions: int = 5
    # Seconds; used for the inactivity check and the cleanup task delay.
    session_timeout: int = 300  # 5 minutes


# Create app and settings
app, settings = create_app(
    service_name="svc-rpa",
    title="Tax Agent RPA Service",
    description="Robotic Process Automation for portal data extraction",
    settings_class=RPASettings,
)

# Global clients
vault_helper: VaultTransitHelper | None = None
event_bus: EventBus | None = None
browser: Browser | None = None
# Kept so the Playwright driver can be stopped on shutdown.
playwright_driver: Playwright | None = None
# session_id -> {"context", "page", "portal", "tenant_id", "user_id",
#                "created_at", "last_activity", optionally "authenticated"}
active_sessions: dict[str, dict[str, Any]] = {}

tracer = get_tracer("svc-rpa")
metrics = get_metrics()


def _utcnow() -> datetime:
    """Return the current UTC time as a naive ``datetime``.

    Replaces the deprecated ``datetime.utcnow()`` while keeping the value
    naive, so ISO strings and stored timestamps keep their previous shape.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None)


@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies"""
    global vault_helper, event_bus, browser, playwright_driver

    logger.info("Starting RPA service")

    # Setup observability
    setup_observability(settings)

    # Initialize Vault helper
    vault_client = create_vault_client(settings)
    vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")

    # Initialize event bus
    event_bus = create_event_bus(settings)
    await event_bus.start()  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]

    # Initialize browser; keep the driver handle so shutdown can stop it.
    playwright_driver = await async_playwright().start()
    browser = await playwright_driver[settings.browser_type].launch(
        headless=settings.headless,
        args=["--no-sandbox", "--disable-dev-shm-usage"] if settings.headless else [],
    )

    logger.info("RPA service started successfully")


@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global event_bus, browser, playwright_driver

    logger.info("Shutting down RPA service")

    if browser:
        await browser.close()

    # Stop the driver started in startup_event; it was previously left running.
    if playwright_driver:
        await playwright_driver.stop()

    if event_bus:
        await event_bus.stop()

    logger.info("RPA service shutdown complete")


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": _utcnow().isoformat(),
        "active_sessions": len(active_sessions),
    }


@app.post("/sessions")
async def create_session(
    portal: str,
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Create new RPA session

    Opens a fresh browser context and page, registers them in
    ``active_sessions`` and schedules a cleanup task.

    Returns:
        ``session_id``, ``portal``, ``status`` and ``expires_at`` (Unix
        timestamp in seconds).

    Raises:
        HTTPException: 429 when ``max_concurrent_sessions`` is reached,
            500 for any other failure.
    """
    with tracer.start_as_current_span("create_session") as span:
        span.set_attribute("portal", portal)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Check session limits
            if len(active_sessions) >= settings.max_concurrent_sessions:
                raise HTTPException(status_code=429, detail="Too many active sessions")

            # Generate session ID
            session_id = str(ulid.new())
            span.set_attribute("session_id", session_id)

            # Create browser context
            context = await browser.new_context(  # pyright: ignore[reportOptionalMemberAccess]
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            )
            try:
                page = await context.new_page()
            except Exception:
                # Do not leak the context when the page cannot be opened.
                await context.close()
                raise

            # Store session
            now = _utcnow()
            active_sessions[session_id] = {
                "context": context,
                "page": page,
                "portal": portal,
                "tenant_id": tenant_id,
                "user_id": current_user.get("sub"),
                "created_at": now,
                "last_activity": now,
            }

            # Schedule session cleanup
            background_tasks.add_task(
                _cleanup_session_after_timeout, session_id, settings.session_timeout
            )

            logger.info("RPA session created", session_id=session_id, portal=portal)

            return {
                "session_id": session_id,
                "portal": portal,
                "status": "created",
                # Use an aware datetime: .timestamp() on a naive UTC value
                # is interpreted as local time and skews the epoch.
                "expires_at": (
                    datetime.now(timezone.utc).timestamp() + settings.session_timeout
                ),
            }

        except HTTPException:
            # Preserve deliberate status codes (e.g. 429) instead of masking as 500.
            raise
        except Exception as e:
            logger.error("Failed to create session", error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to create session"
            ) from e


@app.post("/sessions/{session_id}/navigate")
async def navigate_to_url(
    session_id: str,
    url: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Navigate to URL in session

    Returns:
        ``status``, the final page ``url``, its ``title`` and
        ``response_status`` (``None`` when the navigation produced no
        response object).

    Raises:
        HTTPException: 404/403/408 from session validation, 500 when the
            navigation itself fails.
    """
    # NOTE(review): ``url`` is caller-supplied and passed to the browser
    # unvalidated; confirm whether scheme/host restrictions are needed.
    with tracer.start_as_current_span("navigate") as span:
        span.set_attribute("session_id", session_id)
        span.set_attribute("url", url)

        try:
            session = _get_session(session_id, tenant_id)
            page = session["page"]

            # Navigate to URL
            response = await page.goto(url, timeout=settings.timeout)
            # goto() may return None (no main-resource response).
            response_status = response.status if response is not None else None

            # Update last activity
            session["last_activity"] = _utcnow()

            # Take screenshot for debugging (the bytes are not stored)
            await page.screenshot()

            logger.info(
                "Navigated to URL",
                session_id=session_id,
                url=url,
                status=response_status,
            )

            return {
                "status": "success",
                "url": page.url,
                "title": await page.title(),
                "response_status": response_status,
            }

        except HTTPException:
            # Keep 404/403/408 from _get_session intact.
            raise
        except Exception as e:
            logger.error(
                "Navigation failed", session_id=session_id, url=url, error=str(e)
            )
            raise HTTPException(
                status_code=500, detail=f"Navigation failed: {str(e)}"
            ) from e


@app.post("/sessions/{session_id}/login")
async def login_to_portal(
    session_id: str,
    credentials: dict[str, str],
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Login to portal using encrypted credentials

    Each value in ``credentials`` is decrypted through ``vault_helper`` with
    its dict key used as ``key_name``; the portal-specific login routine is
    then run and the outcome stored on the session as ``authenticated``.

    Raises:
        HTTPException: 404/403/408 from session validation, 500 for any
            other failure (including an unsupported portal).
    """
    with tracer.start_as_current_span("login") as span:
        span.set_attribute("session_id", session_id)

        try:
            session = _get_session(session_id, tenant_id)
            page = session["page"]
            portal = session["portal"]

            # Decrypt credentials
            decrypted_credentials: dict[str, Any] = {}
            for key, encrypted_value in credentials.items():
                decrypted_credentials[key] = (
                    vault_helper.decrypt_field(  # pyright: ignore[reportOptionalMemberAccess]
                        key_name=key, ciphertext=encrypted_value
                    )
                )

            # Perform login based on portal type
            if portal == "hmrc":
                success = await _login_hmrc(page, decrypted_credentials)
            elif portal == "open_banking":
                success = await _login_open_banking(page, decrypted_credentials)
            else:
                raise ValueError(f"Unsupported portal: {portal}")

            # Update session
            session["last_activity"] = _utcnow()
            session["authenticated"] = success

            if success:
                logger.info("Login successful", session_id=session_id, portal=portal)
                return {"status": "success", "authenticated": True}
            else:
                logger.warning("Login failed", session_id=session_id, portal=portal)
                return {"status": "failed", "authenticated": False}

        except HTTPException:
            # Keep 404/403/408 from _get_session intact.
            raise
        except Exception as e:
            logger.error("Login error", session_id=session_id, error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Login failed: {str(e)}"
            ) from e


@app.post("/sessions/{session_id}/extract")
async def extract_data(
    session_id: str,
    extraction_config: dict[str, Any],
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Extract data from portal

    Runs the portal-specific extractor, publishes an ``rpa.data_extracted``
    event and returns the extracted payload with a record count.

    Raises:
        HTTPException: 401 when the session is not authenticated,
            404/403/408 from session validation, 500 for any other failure.
    """
    with tracer.start_as_current_span("extract_data") as span:
        span.set_attribute("session_id", session_id)

        try:
            session = _get_session(session_id, tenant_id)
            page = session["page"]
            portal = session["portal"]

            # Check authentication
            if not session.get("authenticated", False):
                raise HTTPException(status_code=401, detail="Session not authenticated")

            # Extract data based on portal and config
            if portal == "hmrc":
                extracted_data = await _extract_hmrc_data(page, extraction_config)
            elif portal == "open_banking":
                extracted_data = await _extract_banking_data(page, extraction_config)
            else:
                raise ValueError(f"Unsupported portal: {portal}")

            # Update session
            session["last_activity"] = _utcnow()

            # Publish extraction event
            event_payload = EventPayload(
                data={
                    "session_id": session_id,
                    "portal": portal,
                    "extraction_config": extraction_config,
                    "extracted_data": extracted_data,
                    "tenant_id": tenant_id,
                },
                actor=current_user.get("sub", "system"),
                tenant_id=tenant_id,
                trace_id=span.get_span_context().trace_id,
            )

            await event_bus.publish("rpa.data_extracted", event_payload)  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]

            records_count = len(extracted_data.get("records", []))

            logger.info(
                "Data extracted",
                session_id=session_id,
                portal=portal,
                records_count=records_count,
            )

            return {
                "status": "success",
                "extracted_data": extracted_data,
                "records_count": records_count,
            }

        except HTTPException:
            # Keep 401 and the _get_session status codes intact.
            raise
        except Exception as e:
            logger.error("Data extraction failed", session_id=session_id, error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Extraction failed: {str(e)}"
            ) from e


@app.delete("/sessions/{session_id}")
async def close_session(
    session_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, str]:
    """Close RPA session

    Raises:
        HTTPException: 404/403/408 from session validation, 500 when the
            browser context cannot be closed.
    """
    with tracer.start_as_current_span("close_session") as span:
        span.set_attribute("session_id", session_id)

        try:
            session = _get_session(session_id, tenant_id)

            # Remove from active sessions first so the timeout cleanup task
            # cannot close the same context a second time.
            active_sessions.pop(session_id, None)

            # Close browser context
            await session["context"].close()

            logger.info("Session closed", session_id=session_id)

            return {"status": "closed"}

        except HTTPException:
            # Keep 404/403/408 from _get_session intact.
            raise
        except Exception as e:
            logger.error("Failed to close session", session_id=session_id, error=str(e))
            raise HTTPException(
                status_code=500, detail="Failed to close session"
            ) from e


def _get_session(session_id: str, tenant_id: str) -> dict[str, Any]:
    """Get and validate session

    Raises:
        HTTPException: 404 when the session is unknown, 403 when it belongs
            to another tenant, 408 when it has been idle for longer than
            ``settings.session_timeout`` seconds.
    """
    if session_id not in active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = active_sessions[session_id]

    # Check tenant access
    if session["tenant_id"] != tenant_id:
        raise HTTPException(status_code=403, detail="Access denied")

    # Check timeout. total_seconds() is required: timedelta.seconds drops
    # the days component, so a session idle for over a day looked fresh.
    idle_seconds = (_utcnow() - session["last_activity"]).total_seconds()
    if idle_seconds > settings.session_timeout:
        raise HTTPException(status_code=408, detail="Session expired")

    return session


async def _login_hmrc(page: Page, credentials: dict[str, str]) -> bool:
    """Login to HMRC portal

    Reads ``user_id`` and ``password`` from ``credentials`` (missing keys
    are submitted as empty strings). Returns ``True`` when the URL after
    submission does not contain ``sign-in``; any exception is logged and
    reported as ``False``.
    """
    try:
        # Navigate to HMRC login
        await page.goto(settings.hmrc_base_url)

        # Wait for login form
        await page.wait_for_selector('input[name="userId"]', timeout=settings.timeout)

        # Fill credentials
        await page.fill('input[name="userId"]', credentials.get("user_id", ""))
        await page.fill('input[name="password"]', credentials.get("password", ""))

        # Submit form
        await page.click('button[type="submit"]')

        # Wait for redirect or error
        await page.wait_for_load_state("networkidle")

        # Check if login was successful
        current_url = page.url
        return "sign-in" not in current_url.lower()

    except Exception as e:
        logger.error("HMRC login failed", error=str(e))
        return False


async def _login_open_banking(page: Page, credentials: dict[str, str]) -> bool:
    """Login to Open Banking portal

    Not implemented yet: logs a warning and always returns ``False``.
    """
    try:
        # This would implement Open Banking login flow
        # For now, return False as it's not implemented
        logger.warning("Open Banking login not implemented")
        return False

    except Exception as e:
        logger.error("Open Banking login failed", error=str(e))
        return False


async def _extract_hmrc_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
    """Extract data from HMRC portal

    ``config`` may carry ``data_type`` (default ``"tax_returns"``) and
    ``tax_year`` (default ``"2023-24"``). Only ``tax_returns`` triggers page
    scraping; other data types yield an empty ``records`` list. On any
    exception the error is logged and ``{"error": ..., "records": []}`` is
    returned instead of raising.
    """
    try:
        data_type = config.get("data_type", "tax_returns")
        tax_year = config.get("tax_year", "2023-24")

        records: list[Any] = []
        extracted_data: dict[str, Any] = {
            "data_type": data_type,
            "tax_year": tax_year,
            "records": records,
            "extracted_at": _utcnow().isoformat(),
        }

        if data_type == "tax_returns":
            # Navigate to tax returns section
            await page.click('a[href*="tax-return"]')
            await page.wait_for_load_state("networkidle")

            # Extract return data
            returns = await page.query_selector_all(".tax-return-item")
            for return_element in returns:
                return_data = await return_element.evaluate(
                    """
                    element => ({
                        year: element.querySelector('.tax-year')?.textContent?.trim(),
                        status: element.querySelector('.status')?.textContent?.trim(),
                        amount: element.querySelector('.amount')?.textContent?.trim()
                    })
                    """
                )
                records.append(return_data)

        return extracted_data

    except Exception as e:
        logger.error("HMRC data extraction failed", error=str(e))
        return {"error": str(e), "records": []}


async def _extract_banking_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
    """Extract banking data via Open Banking

    Not implemented yet: logs a warning and returns an error payload with an
    empty ``records`` list.
    """
    try:
        # This would implement Open Banking data extraction
        logger.warning("Open Banking extraction not implemented")
        return {"error": "Not implemented", "records": []}

    except Exception as e:
        logger.error("Banking data extraction failed", error=str(e))
        return {"error": str(e), "records": []}


async def _cleanup_session_after_timeout(session_id: str, timeout_seconds: int) -> None:
    """Cleanup session after timeout

    Sleeps for ``timeout_seconds`` and then closes the session's browser
    context if the session is still registered. Failures are logged, never
    raised.
    """
    await asyncio.sleep(timeout_seconds)

    # pop() removes the entry atomically, so a concurrent close_session
    # cannot trigger a KeyError or a second close.
    session = active_sessions.pop(session_id, None)
    if session is None:
        return

    try:
        await session["context"].close()
        logger.info("Session cleaned up due to timeout", session_id=session_id)
    except Exception as e:
        logger.error(
            "Failed to cleanup session", session_id=session_id, error=str(e)
        )


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8001, reload=True, log_config=None)