Initial commit
Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
4 apps/__init__.py Normal file
@@ -0,0 +1,4 @@
# file: /Users/harris/Projects/ai-tax-agent/apps/__init__.py
# hypothesis_version: 6.138.15

[]
53 apps/svc_coverage/Dockerfile Normal file
@@ -0,0 +1,53 @@
# Multi-stage build for svc-coverage
FROM python:3.12-slim AS builder

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_coverage/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Production stage
FROM python:3.12-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r appuser \
    && useradd -r -g appuser appuser

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_coverage/ ./apps/svc_coverage/

# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_coverage.main:app", "--host", "0.0.0.0", "--port", "8000"]
1 apps/svc_coverage/__init__.py Normal file
@@ -0,0 +1 @@
"""Coverage service package."""
112 apps/svc_coverage/alembic.ini Normal file
@@ -0,0 +1,112 @@
# A generic, single database configuration.

[alembic]
# path to migration scripts
script_location = alembic

# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
# Uncomment the line below if you want the files to be prepended with date and time
# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s

# sys.path path, will be prepended to sys.path if present.
# defaults to the current working directory.
prepend_sys_path = .

# timezone to use when rendering the date within the migration file
# as well as the filename.
# If specified, requires the python-dateutil library that can be
# installed by adding `alembic[tz]` to the pip requirements
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =

# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40

# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false

# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false

# version number format
version_num_format = %04d

# version path separator; As mentioned above, this is the character used to split
# version_locations. The default within new alembic.ini files is "os", which uses
# os.pathsep. If this key is omitted entirely, it falls back to the legacy
# behavior of splitting on spaces and/or commas.
# Valid values for version_path_separator are:
#
# version_path_separator = :
# version_path_separator = ;
# version_path_separator = space
version_path_separator = os

# set to 'true' to search source files recursively
# in each "version_locations" directory
# new in Alembic version 1.10
# recursive_version_locations = false

# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8

sqlalchemy.url = postgresql://user:pass@localhost:5432/coverage


[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples

# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks = black
# black.type = console_scripts
# black.entrypoint = black
# black.options = -l 79 REVISION_SCRIPT_FILENAME

# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
# hooks = ruff
# ruff.type = exec
# ruff.executable = %(here)s/.venv/bin/ruff
# ruff.options = --fix REVISION_SCRIPT_FILENAME

# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic

[handlers]
keys = console

[formatters]
keys = generic

[logger_root]
level = WARN
handlers = console
qualname =

[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine

[logger_alembic]
level = INFO
handlers =
qualname = alembic

[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic

[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
92 apps/svc_coverage/alembic/env.py Normal file
@@ -0,0 +1,92 @@
"""Alembic environment configuration for coverage service."""

import os
import sys
from logging.config import fileConfig

from alembic import context
from sqlalchemy import engine_from_config, pool

# Add the parent directory to the path so we can import our models
sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..", ".."))

# Import your models here
from apps.svc_coverage.models import Base

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
if config.config_file_name is not None:
    fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
target_metadata = Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def get_url():
    """Get database URL from environment or config."""
    return os.getenv("DATABASE_URL", config.get_main_option("sqlalchemy.url"))


def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    url = get_url()
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online() -> None:
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.

    """
    configuration = config.get_section(config.config_ini_section)
    configuration["sqlalchemy.url"] = get_url()

    connectable = engine_from_config(
        configuration,
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        context.configure(
            connection=connection, target_metadata=target_metadata
        )

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
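env.py resolves the connection string through get_url(), so a DATABASE_URL environment variable takes precedence over the sqlalchemy.url placeholder in alembic.ini. A minimal sketch of driving these migrations programmatically, assuming Alembic is installed and the script is run from apps/svc_coverage (the URL below is illustrative, not a real credential):

import os

from alembic import command
from alembic.config import Config

# DATABASE_URL wins over alembic.ini's sqlalchemy.url because env.py checks it first
os.environ["DATABASE_URL"] = "postgresql://user:pass@localhost:5432/coverage"

cfg = Config("alembic.ini")   # run from apps/svc_coverage so script_location = alembic resolves
command.upgrade(cfg, "head")  # applies revision 0001 and anything stacked on top of it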
24 apps/svc_coverage/alembic/script.py.mako Normal file
@@ -0,0 +1,24 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}


def upgrade() -> None:
    ${upgrades if upgrades else "pass"}


def downgrade() -> None:
    ${downgrades if downgrades else "pass"}
@@ -0,0 +1,76 @@
"""Initial coverage tables

Revision ID: 0001
Revises:
Create Date: 2024-09-14 12:00:00.000000

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = '0001'
down_revision = None
branch_labels = None
depends_on = None


def upgrade() -> None:
    # Create coverage_versions table
    op.create_table(
        'coverage_versions',
        sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
        sa.Column('version', sa.String(length=50), nullable=False),
        sa.Column('jurisdiction', sa.String(length=10), nullable=False),
        sa.Column('tax_year', sa.String(length=10), nullable=False),
        sa.Column('tenant_id', sa.String(length=100), nullable=True),
        sa.Column('source_files', postgresql.JSON(astext_type=sa.Text()), nullable=False),
        sa.Column('compiled_at', sa.DateTime(), nullable=False),
        sa.Column('hash', sa.String(length=64), nullable=False),
        sa.PrimaryKeyConstraint('id')
    )

    # Create indexes for coverage_versions
    op.create_index('ix_coverage_versions_version', 'coverage_versions', ['version'])
    op.create_index('ix_coverage_versions_jurisdiction_tax_year', 'coverage_versions', ['jurisdiction', 'tax_year'])
    op.create_index('ix_coverage_versions_tenant_id', 'coverage_versions', ['tenant_id'])
    op.create_index('ix_coverage_versions_hash', 'coverage_versions', ['hash'])

    # Create coverage_audit table
    op.create_table(
        'coverage_audit',
        sa.Column('id', sa.Integer(), autoincrement=True, nullable=False),
        sa.Column('taxpayer_id', sa.String(length=100), nullable=False),
        sa.Column('tax_year', sa.String(length=10), nullable=False),
        sa.Column('policy_version', sa.String(length=50), nullable=False),
        sa.Column('overall_status', sa.String(length=20), nullable=False),
        sa.Column('blocking_items', postgresql.JSON(astext_type=sa.Text()), nullable=False),
        sa.Column('created_at', sa.DateTime(), nullable=False),
        sa.Column('trace_id', sa.String(length=100), nullable=True),
        sa.PrimaryKeyConstraint('id')
    )

    # Create indexes for coverage_audit
    op.create_index('ix_coverage_audit_taxpayer_id', 'coverage_audit', ['taxpayer_id'])
    op.create_index('ix_coverage_audit_tax_year', 'coverage_audit', ['tax_year'])
    op.create_index('ix_coverage_audit_taxpayer_tax_year', 'coverage_audit', ['taxpayer_id', 'tax_year'])
    op.create_index('ix_coverage_audit_created_at', 'coverage_audit', ['created_at'])
    op.create_index('ix_coverage_audit_trace_id', 'coverage_audit', ['trace_id'])


def downgrade() -> None:
    # Drop coverage_audit table and indexes
    op.drop_index('ix_coverage_audit_trace_id', table_name='coverage_audit')
    op.drop_index('ix_coverage_audit_created_at', table_name='coverage_audit')
    op.drop_index('ix_coverage_audit_taxpayer_tax_year', table_name='coverage_audit')
    op.drop_index('ix_coverage_audit_tax_year', table_name='coverage_audit')
    op.drop_index('ix_coverage_audit_taxpayer_id', table_name='coverage_audit')
    op.drop_table('coverage_audit')

    # Drop coverage_versions table and indexes
    op.drop_index('ix_coverage_versions_hash', table_name='coverage_versions')
    op.drop_index('ix_coverage_versions_tenant_id', table_name='coverage_versions')
    op.drop_index('ix_coverage_versions_jurisdiction_tax_year', table_name='coverage_versions')
    op.drop_index('ix_coverage_versions_version', table_name='coverage_versions')
    op.drop_table('coverage_versions')
523 apps/svc_coverage/main.py Normal file
@@ -0,0 +1,523 @@
# FILE: apps/svc-coverage/main.py

# Coverage policy service with evaluation, clarification, and hot reload

import os
import sys
from typing import Any

import structlog
from fastapi import Depends, HTTPException
from pydantic import BaseModel

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
from libs.coverage import CoverageEvaluator
from libs.events import EventBus
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.policy import PolicyLoader, get_policy_loader
from libs.schemas import (
    ClarifyContext,
    ClarifyResponse,
    CoverageGap,
    CoverageReport,
    PolicyError,
    UploadOption,
    ValidationResult,
)
from libs.security import get_current_user, get_tenant_id

logger = structlog.get_logger()


async def http_exception_handler(_request, exc) -> dict[str, str | int]:
    """Handle HTTP exceptions"""
    return {"detail": exc.detail, "status_code": exc.status_code}


class CoverageSettings(BaseAppSettings):
    """Settings for Coverage service"""

    service_name: str = "svc-coverage"

    # Policy configuration
    config_dir: str = "config"
    policy_reload_enabled: bool = True

    # Database
    postgres_url: str = "postgresql://user:pass@localhost:5432/coverage"

    # External services
    rag_service_url: str = "http://svc-rag-retriever:8000"


# Create app and settings
app, settings = create_app(
    service_name="svc-coverage",
    title="Tax Agent Coverage Policy Service",
    description="Coverage policy evaluation and clarification service",
    settings_class=CoverageSettings,
)

# Global state
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
policy_loader: PolicyLoader | None = None
current_policy: Any = None


@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies"""
    global neo4j_client, event_bus, policy_loader, current_policy

    # Setup observability
    setup_observability(settings)

    # Initialize Neo4j client
    neo4j_driver = create_neo4j_client(settings)
    neo4j_client = Neo4jClient(neo4j_driver)

    # Initialize event bus
    event_bus = create_event_bus(settings)

    # Initialize policy loader
    policy_loader = get_policy_loader(settings.config_dir)

    # Load initial policy
    try:
        policy = policy_loader.load_policy()
        current_policy = policy_loader.compile_predicates(policy)
        logger.info("Initial policy loaded", version=policy.version)
    except Exception as e:
        logger.error("Failed to load initial policy", error=str(e))
        current_policy = None

    logger.info("Coverage service started")


@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global neo4j_client, event_bus

    if neo4j_client:
        await neo4j_client.close()

    if event_bus:
        await event_bus.close()

    logger.info("Coverage service stopped")


# Request/Response models
class CheckCoverageRequest(BaseModel):
    """Request to check document coverage"""

    tax_year: str
    taxpayer_id: str


class ClarifyRequest(BaseModel):
    """Request to generate clarifying question"""

    gap: CoverageGap
    context: ClarifyContext


class ReloadRequest(BaseModel):
    """Request to reload policy"""

    force: bool = False


# Metrics
metrics = get_metrics()
tracer = get_tracer()


@app.post("/v1/coverage/check")
async def check_coverage(
    request: CheckCoverageRequest,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> CoverageReport:
    """Check document coverage for taxpayer"""

    with tracer.start_as_current_span("check_coverage") as span:
        span.set_attribute("taxpayer_id", request.taxpayer_id)
        span.set_attribute("tax_year", request.tax_year)
        span.set_attribute("tenant_id", tenant_id)

        try:
            if not current_policy:
                raise HTTPException(status_code=503, detail="Policy not loaded")

            # Create evaluator with KG and RAG clients
            evaluator = CoverageEvaluator(
                kg_client=neo4j_client,
                rag_client=None,  # TODO: Initialize RAG client
            )

            # Perform coverage evaluation
            report = await evaluator.check_document_coverage(
                request.taxpayer_id,
                request.tax_year,
                current_policy,
            )

            # Record audit trail
            await _record_coverage_audit(report, tenant_id)

            # Update metrics
            metrics.counter("coverage_checks_total").labels(
                tenant_id=tenant_id,
                tax_year=request.tax_year,
                overall_status=report.overall_status.value,
            ).inc()

            return report

        except HTTPException:
            # Re-raise HTTP exceptions as-is
            raise
        except Exception as e:
            logger.error(
                "Coverage check failed",
                taxpayer_id=request.taxpayer_id,
                tax_year=request.tax_year,
                error=str(e),
            )
            raise HTTPException(
                status_code=500, detail=f"Coverage check failed: {str(e)}"
            ) from e


@app.post("/v1/coverage/clarify")
async def clarify_gap(
    request: ClarifyRequest,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> ClarifyResponse:
    """Generate clarifying question for coverage gap"""

    with tracer.start_as_current_span("clarify_gap") as span:
        span.set_attribute("schedule_id", request.gap.schedule_id)
        span.set_attribute("evidence_id", request.gap.evidence_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            if not current_policy:
                raise HTTPException(status_code=503, detail="Policy not loaded")

            # Generate clarifying question
            response = await _generate_clarifying_question(request.gap, request.context)

            # Update metrics
            metrics.counter("clarifications_total").labels(
                tenant_id=tenant_id,
                schedule_id=request.gap.schedule_id,
                evidence_id=request.gap.evidence_id,
            ).inc()

            return response

        except HTTPException:
            # Re-raise HTTP exceptions as-is
            raise
        except Exception as e:
            logger.error(
                "Clarification failed",
                gap=request.gap.dict(),
                error=str(e),
            )
            raise HTTPException(
                status_code=500, detail=f"Clarification failed: {str(e)}"
            ) from e


@app.post("/admin/coverage/reload")
async def reload_policy(
    request: ReloadRequest,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Reload coverage policy from files"""

    # Check admin permissions
    user_groups = current_user.get("groups", [])
    if "admin" not in user_groups:
        raise HTTPException(status_code=403, detail="Admin access required")

    with tracer.start_as_current_span("reload_policy") as span:
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("force", request.force)

        try:
            global current_policy

            if not policy_loader:
                raise HTTPException(
                    status_code=503, detail="Policy loader not initialized"
                )

            # Load and compile new policy
            policy = policy_loader.load_policy()
            new_compiled_policy = policy_loader.compile_predicates(policy)

            # Record new policy version
            await _record_policy_version(new_compiled_policy, tenant_id)

            # Update current policy
            current_policy = new_compiled_policy

            logger.info(
                "Policy reloaded",
                version=policy.version,
                hash=new_compiled_policy.hash,
                tenant_id=tenant_id,
            )

            return {
                "success": True,
                "version": policy.version,
                "hash": new_compiled_policy.hash,
                "compiled_at": new_compiled_policy.compiled_at.isoformat(),
                "source_files": new_compiled_policy.source_files,
            }

        except PolicyError as e:
            logger.error("Policy reload failed", error=str(e))
            raise HTTPException(
                status_code=400, detail=f"Policy error: {str(e)}"
            ) from e
        except Exception as e:
            logger.error("Policy reload failed", error=str(e))
            raise HTTPException(
                status_code=500, detail=f"Reload failed: {str(e)}"
            ) from e


@app.get("/v1/coverage/policy")
async def get_current_policy(
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Get current compiled policy (no secrets, no PII)"""

    with tracer.start_as_current_span("get_policy") as span:
        span.set_attribute("tenant_id", tenant_id)

        if not current_policy:
            raise HTTPException(status_code=503, detail="Policy not loaded")

        # Return sanitized policy info
        return {
            "version": current_policy.policy.version,
            "jurisdiction": current_policy.policy.jurisdiction,
            "tax_year": current_policy.policy.tax_year,
            "compiled_at": current_policy.compiled_at.isoformat(),
            "hash": current_policy.hash,
            "source_files": current_policy.source_files,
            "schedules": list(current_policy.policy.schedules.keys()),
            "document_kinds": current_policy.policy.document_kinds,
        }


@app.get("/v1/coverage/validate")
async def validate_policy(
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> ValidationResult:
    """Validate current policy configuration"""

    with tracer.start_as_current_span("validate_policy") as span:
        span.set_attribute("tenant_id", tenant_id)

        try:
            if not policy_loader:
                raise HTTPException(
                    status_code=503, detail="Policy loader not initialized"
                )

            # Load policy as dict for validation
            policy_dict = policy_loader._load_yaml_file(
                os.path.join(settings.config_dir, "coverage.yaml")
            )

            # Validate policy
            result = policy_loader.validate_policy(policy_dict)

            # Additional validation: check box existence in KG
            if neo4j_client and result.ok:
                box_validation_errors = await _validate_boxes_in_kg(policy_dict)
                if box_validation_errors:
                    result.errors.extend(box_validation_errors)
                    result.ok = False

            return result

        except Exception as e:
            logger.error("Policy validation failed", error=str(e))
            return ValidationResult(
                ok=False,
                errors=[f"Validation failed: {str(e)}"],
            )


# Helper functions


async def _record_coverage_audit(report: CoverageReport, tenant_id: str) -> None:
    """Record coverage audit trail"""
    # TODO: Implement database recording
    logger.info(
        "Coverage audit recorded",
        taxpayer_id=report.taxpayer_id,
        tax_year=report.tax_year,
        overall_status=report.overall_status.value,
        blocking_items=len(report.blocking_items),
        tenant_id=tenant_id,
    )


async def _record_policy_version(compiled_policy: Any, tenant_id: str) -> None:
    """Record new policy version"""
    # TODO: Implement database recording
    logger.info(
        "Policy version recorded",
        version=compiled_policy.policy.version,
        hash=compiled_policy.hash,
        tenant_id=tenant_id,
    )


async def _generate_clarifying_question(
    gap: CoverageGap, context: ClarifyContext
) -> ClarifyResponse:
    """Generate clarifying question for coverage gap"""

    if not current_policy:
        raise ValueError("Policy not loaded")

    # Get question template
    templates = current_policy.policy.question_templates
    default_template = templates.default

    # Build question text
    evidence_name = gap.evidence_id
    schedule_name = gap.schedule_id
    boxes_text = ", ".join(gap.boxes) if gap.boxes else "relevant boxes"
    alternatives_text = (
        ", ".join(gap.acceptable_alternatives)
        if gap.acceptable_alternatives
        else "alternative documents"
    )

    question_text = default_template["text"].format(
        schedule=schedule_name,
        tax_year=context.tax_year,
        evidence=evidence_name,
        boxes=boxes_text,
        alternatives=alternatives_text,
    )

    why_text = default_template["why"].format(
        why=gap.reason,
        guidance_doc="policy guidance",
    )

    # Build upload options
    options = []
    if gap.acceptable_alternatives:
        for alt in gap.acceptable_alternatives:
            options.append(
                UploadOption(
                    label=f"Upload {alt} (PDF/CSV)",
                    accepted_formats=["pdf", "csv"],
                    upload_endpoint=f"/v1/ingest/upload?tag={alt}",
                )
            )
    else:
        options.append(
            UploadOption(
                label=f"Upload {evidence_name} (PDF/CSV)",
                accepted_formats=["pdf", "csv"],
                upload_endpoint=f"/v1/ingest/upload?tag={evidence_name}",
            )
        )

    return ClarifyResponse(
        question_text=question_text,
        why_it_is_needed=why_text,
        citations=gap.citations,
        options_to_provide=options,
        blocking=(gap.role.value == "REQUIRED"),
        boxes_affected=gap.boxes,
    )


async def _validate_boxes_in_kg(policy_dict: dict[str, Any]) -> list[str]:
    """Validate that all referenced boxes exist in KG"""

    if not neo4j_client:
        return ["KG client not available for box validation"]

    errors = []
    all_boxes = set()

    # Collect all box references
    for schedule in policy_dict.get("schedules", {}).values():
        for evidence in schedule.get("evidence", []):
            all_boxes.update(evidence.get("boxes", []))

    if all_boxes:
        try:
            from libs.neo import kg_boxes_exist

            box_existence = await kg_boxes_exist(neo4j_client, list(all_boxes))

            for box_id, exists in box_existence.items():
                if not exists:
                    errors.append(f"Form box '{box_id}' not found in knowledge graph")

        except Exception as e:
            errors.append(f"Failed to validate boxes in KG: {str(e)}")

    return errors


# Health check endpoints
@app.get("/healthz")
async def health_check() -> dict[str, str]:
    """Health check endpoint"""
    return {"status": "healthy", "service": "svc-coverage"}


@app.get("/readyz")
async def readiness_check() -> dict[str, str]:
    """Readiness check endpoint"""
    return {"status": "ready", "service": "svc-coverage"}


@app.get("/livez")
async def liveness_check() -> dict[str, str]:
    """Liveness check endpoint"""
    return {"status": "alive", "service": "svc-coverage"}


# Metrics endpoint (internal only)
@app.get("/metrics")
async def get_metrics_endpoint() -> str:
    """Prometheus metrics endpoint"""
    # This would return Prometheus format metrics
    return "# Coverage service metrics\n"


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
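For orientation, a rough client-side sketch of the coverage check: POST /v1/coverage/check takes only tax_year and taxpayer_id (the CheckCoverageRequest model) and returns a serialized CoverageReport. The base URL, token, and tenant header below are illustrative assumptions; the real auth and tenant wiring lives in the get_current_user and get_tenant_id dependencies from libs.security.

import httpx

resp = httpx.post(
    "http://localhost:8000/v1/coverage/check",
    json={"tax_year": "2023-24", "taxpayer_id": "tp-123"},  # CheckCoverageRequest fields
    headers={
        "Authorization": "Bearer <token>",  # consumed by get_current_user
        "X-Tenant-ID": "tenant-a",          # assumed header name; get_tenant_id defines the real source
    },
    timeout=30.0,
)
resp.raise_for_status()
report = resp.json()  # e.g. report["overall_status"], report["blocking_items"]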
46 apps/svc_coverage/models.py Normal file
@@ -0,0 +1,46 @@
"""Database models for coverage service."""

# FILE: apps/svc-coverage/models.py

from datetime import datetime

from sqlalchemy import JSON, Column, DateTime, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class CoverageVersion(Base):
    """Policy version tracking table"""

    __tablename__ = "coverage_versions"

    id = Column(Integer, primary_key=True, autoincrement=True)
    version = Column(String(50), nullable=False)
    jurisdiction = Column(String(10), nullable=False)
    tax_year = Column(String(10), nullable=False)
    tenant_id = Column(String(100), nullable=True)
    source_files = Column(JSON, nullable=False, default=list)
    compiled_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    hash = Column(String(64), nullable=False)

    def __repr__(self) -> str:
        return f"<CoverageVersion(id={self.id}, version='{self.version}', hash='{self.hash[:8]}...')>"


class CoverageAudit(Base):
    """Coverage evaluation audit trail"""

    __tablename__ = "coverage_audit"

    id = Column(Integer, primary_key=True, autoincrement=True)
    taxpayer_id = Column(String(100), nullable=False)
    tax_year = Column(String(10), nullable=False)
    policy_version = Column(String(50), nullable=False)
    overall_status = Column(String(20), nullable=False)
    blocking_items = Column(JSON, nullable=False, default=list)
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    trace_id = Column(String(100), nullable=True)

    def __repr__(self) -> str:
        return f"<CoverageAudit(id={self.id}, taxpayer_id='{self.taxpayer_id}', status='{self.overall_status}')>"
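_record_coverage_audit and _record_policy_version in main.py are still log-only TODOs; a hedged sketch of what the eventual write path could look like with these models (the engine URL and field values are assumptions, and in practice the Alembic migration, not create_all, owns the schema):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from apps.svc_coverage.models import CoverageAudit

engine = create_engine("postgresql://user:pass@localhost:5432/coverage")
Session = sessionmaker(bind=engine)

with Session() as session:
    session.add(
        CoverageAudit(
            taxpayer_id="tp-123",        # example values, not from the commit
            tax_year="2023-24",
            policy_version="2024.1",
            overall_status="BLOCKED",
            blocking_items=[{"schedule_id": "SA105", "evidence_id": "bank_statement"}],
            trace_id="trace-abc",
        )
    )
    session.commit()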
53 apps/svc_extract/Dockerfile Normal file
@@ -0,0 +1,53 @@
# Multi-stage build for svc-extract
FROM python:3.12-slim AS builder

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_extract/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Production stage
FROM python:3.12-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r appuser \
    && useradd -r -g appuser appuser

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_extract/ ./apps/svc_extract/

# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_extract.main:app", "--host", "0.0.0.0", "--port", "8000"]
625 apps/svc_extract/main.py Normal file
@@ -0,0 +1,625 @@
|
||||
"""LLM-based field extraction with confidence scoring and provenance tracking."""
|
||||
|
||||
# FILE: apps/svc-extract/main.py
|
||||
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
|
||||
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
|
||||
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
|
||||
# pylint: disable=too-many-locals,import-outside-toplevel
|
||||
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.calibration import ConfidenceCalibrator
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse, ExtractionRequest, ExtractionResponse
|
||||
from libs.security import (
|
||||
create_trusted_proxy_middleware,
|
||||
get_current_user,
|
||||
get_tenant_id,
|
||||
)
|
||||
from libs.storage import DocumentStorage, StorageClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class ExtractionSettings(BaseAppSettings):
|
||||
"""Settings for extraction service"""
|
||||
|
||||
service_name: str = "svc-extract"
|
||||
|
||||
# LLM configuration
|
||||
openai_api_key: str = ""
|
||||
model_name: str = "gpt-4"
|
||||
max_tokens: int = 2000
|
||||
temperature: float = 0.1
|
||||
|
||||
# Extraction configuration
|
||||
confidence_threshold: float = 0.7
|
||||
max_retries: int = 3
|
||||
chunk_size: int = 4000
|
||||
|
||||
# Prompt templates
|
||||
extraction_prompt_template: str = """
|
||||
Extract the following fields from this document text:
|
||||
{field_definitions}
|
||||
|
||||
Document text:
|
||||
{document_text}
|
||||
|
||||
Return a JSON object with the extracted fields and confidence scores.
|
||||
"""
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-extract",
|
||||
title="Tax Agent Extraction Service",
|
||||
description="LLM-based field extraction service",
|
||||
settings_class=ExtractionSettings,
|
||||
)
|
||||
|
||||
# Add middleware
|
||||
middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
|
||||
app.add_middleware(middleware_factory)
|
||||
|
||||
# Global clients
|
||||
storage_client: StorageClient | None = None
|
||||
document_storage: DocumentStorage | None = None
|
||||
event_bus: EventBus | None = None
|
||||
confidence_calibrator: ConfidenceCalibrator | None = None
|
||||
tracer = get_tracer("svc-extract")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global storage_client, document_storage, event_bus, confidence_calibrator
|
||||
|
||||
logger.info("Starting extraction service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize MinIO client
|
||||
minio_client = create_minio_client(settings)
|
||||
storage_client = StorageClient(minio_client)
|
||||
document_storage = DocumentStorage(storage_client)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
if not event_bus:
|
||||
raise Exception("Event bus not initialized")
|
||||
|
||||
await event_bus.start()
|
||||
|
||||
# Subscribe to OCR completion events
|
||||
await event_bus.subscribe(EventTopics.DOC_OCR_READY, _handle_ocr_ready)
|
||||
|
||||
# Initialize confidence calibrator
|
||||
confidence_calibrator = ConfidenceCalibrator(method="temperature")
|
||||
|
||||
logger.info("Extraction service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global event_bus
|
||||
|
||||
logger.info("Shutting down extraction service")
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("Extraction service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/healthz")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
|
||||
@app.get("/readyz")
|
||||
async def readiness_check() -> dict[str, Any]:
|
||||
"""Readiness check endpoint"""
|
||||
return {
|
||||
"status": "ready",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
|
||||
@app.get("/livez")
|
||||
async def liveness_check() -> dict[str, Any]:
|
||||
"""Liveness check endpoint"""
|
||||
return {
|
||||
"status": "alive",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
|
||||
@app.post("/extract/{doc_id}", response_model=ExtractionResponse)
|
||||
async def extract_fields(
|
||||
doc_id: str,
|
||||
request_data: ExtractionRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user()),
|
||||
tenant_id: str = Depends(get_tenant_id()),
|
||||
) -> ExtractionResponse:
|
||||
"""Extract fields from document"""
|
||||
|
||||
with tracer.start_as_current_span("extract_fields") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("strategy", request_data.strategy)
|
||||
|
||||
try:
|
||||
# Check if OCR results exist
|
||||
ocr_results = (
|
||||
await document_storage.get_ocr_result(tenant_id, doc_id)
|
||||
if document_storage
|
||||
else None
|
||||
)
|
||||
if not ocr_results:
|
||||
raise HTTPException(status_code=404, detail="OCR results not found")
|
||||
|
||||
# Generate extraction ID
|
||||
extraction_id = str(ulid.new())
|
||||
span.set_attribute("extraction_id", extraction_id)
|
||||
|
||||
# Start background extraction
|
||||
background_tasks.add_task(
|
||||
_extract_fields_async,
|
||||
doc_id,
|
||||
tenant_id,
|
||||
ocr_results,
|
||||
request_data.strategy,
|
||||
extraction_id,
|
||||
current_user.get("sub", "system"),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Field extraction started", doc_id=doc_id, extraction_id=extraction_id
|
||||
)
|
||||
|
||||
return ExtractionResponse(
|
||||
extraction_id=extraction_id,
|
||||
confidence=0.0, # Will be updated when processing completes
|
||||
extracted_fields={},
|
||||
provenance=[],
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to start extraction", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to start extraction")
|
||||
|
||||
|
||||
@app.get("/results/{doc_id}")
|
||||
async def get_extraction_results(
|
||||
doc_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user()),
|
||||
tenant_id: str = Depends(get_tenant_id()),
|
||||
) -> ExtractionResponse:
|
||||
"""Get extraction results for document"""
|
||||
|
||||
with tracer.start_as_current_span("get_extraction_results") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get extraction results from storage
|
||||
extraction_results = (
|
||||
await document_storage.get_extraction_result(tenant_id, doc_id)
|
||||
if document_storage
|
||||
else None
|
||||
)
|
||||
|
||||
if not extraction_results:
|
||||
raise HTTPException(
|
||||
status_code=404, detail="Extraction results not found"
|
||||
)
|
||||
|
||||
# pylint: disable-next=not-a-mapping
|
||||
return ExtractionResponse(**extraction_results)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to get extraction results", doc_id=doc_id, error=str(e)
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to get extraction results"
|
||||
)
|
||||
|
||||
|
||||
async def _handle_ocr_ready(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle OCR completion events"""
|
||||
try:
|
||||
data = payload.data
|
||||
doc_id = data.get("doc_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
|
||||
if not doc_id or not tenant_id:
|
||||
logger.warning("Invalid OCR ready event", data=data)
|
||||
return
|
||||
|
||||
logger.info("Auto-extracting fields from OCR results", doc_id=doc_id)
|
||||
|
||||
# Get OCR results
|
||||
ocr_results = data.get("ocr_results")
|
||||
if not ocr_results:
|
||||
ocr_results = (
|
||||
await document_storage.get_ocr_result(tenant_id, doc_id)
|
||||
if document_storage
|
||||
else None
|
||||
)
|
||||
|
||||
if ocr_results:
|
||||
await _extract_fields_async(
|
||||
doc_id=doc_id,
|
||||
tenant_id=tenant_id,
|
||||
ocr_results=ocr_results,
|
||||
strategy="hybrid",
|
||||
extraction_id=str(ulid.new()),
|
||||
actor=payload.actor,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to handle OCR ready event", error=str(e))
|
||||
|
||||
|
||||
async def _extract_fields_async(
|
||||
doc_id: str,
|
||||
tenant_id: str,
|
||||
ocr_results: dict[str, Any],
|
||||
strategy: str,
|
||||
extraction_id: str,
|
||||
actor: str,
|
||||
) -> None:
|
||||
"""Extract fields asynchronously"""
|
||||
|
||||
with tracer.start_as_current_span("extract_fields_async") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("extraction_id", extraction_id)
|
||||
span.set_attribute("strategy", strategy)
|
||||
|
||||
try:
|
||||
# Extract text from OCR results
|
||||
document_text = _extract_text_from_ocr(ocr_results)
|
||||
|
||||
# Determine field definitions based on document type
|
||||
field_definitions = _get_field_definitions(doc_id, document_text)
|
||||
|
||||
# Perform extraction
|
||||
if strategy == "llm":
|
||||
extracted_fields, confidence, provenance = await _extract_with_llm(
|
||||
document_text, field_definitions, ocr_results
|
||||
)
|
||||
elif strategy == "rules":
|
||||
extracted_fields, confidence, provenance = await _extract_with_rules(
|
||||
document_text, field_definitions, ocr_results
|
||||
)
|
||||
elif strategy == "hybrid":
|
||||
# Combine LLM and rules-based extraction
|
||||
llm_fields, llm_conf, llm_prov = await _extract_with_llm(
|
||||
document_text, field_definitions, ocr_results
|
||||
)
|
||||
rules_fields, rules_conf, rules_prov = await _extract_with_rules(
|
||||
document_text, field_definitions, ocr_results
|
||||
)
|
||||
|
||||
extracted_fields, confidence, provenance = _merge_extractions(
|
||||
llm_fields, llm_conf, llm_prov, rules_fields, rules_conf, rules_prov
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unknown strategy: {strategy}")
|
||||
|
||||
# Calibrate confidence
|
||||
if confidence_calibrator and confidence_calibrator.is_fitted:
|
||||
calibrated_confidence = confidence_calibrator.calibrate([confidence])[0]
|
||||
else:
|
||||
calibrated_confidence = confidence
|
||||
|
||||
# Create extraction results
|
||||
extraction_results = {
|
||||
"doc_id": doc_id,
|
||||
"extraction_id": extraction_id,
|
||||
"strategy": strategy,
|
||||
"extracted_at": datetime.utcnow().isoformat(),
|
||||
"confidence": calibrated_confidence,
|
||||
"raw_confidence": confidence,
|
||||
"extracted_fields": extracted_fields,
|
||||
"provenance": provenance,
|
||||
"field_count": len(extracted_fields),
|
||||
}
|
||||
|
||||
# Store results
|
||||
if document_storage:
|
||||
await document_storage.store_extraction_result(
|
||||
tenant_id, doc_id, extraction_results
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("extractions_completed_total").labels(
|
||||
tenant_id=tenant_id, strategy=strategy
|
||||
).inc()
|
||||
|
||||
metrics.histogram("extraction_confidence").labels(
|
||||
strategy=strategy
|
||||
).observe(calibrated_confidence)
|
||||
|
||||
# Publish completion event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"doc_id": doc_id,
|
||||
"tenant_id": tenant_id,
|
||||
"extraction_id": extraction_id,
|
||||
"strategy": strategy,
|
||||
"confidence": calibrated_confidence,
|
||||
"field_count": len(extracted_fields),
|
||||
"extraction_results": extraction_results,
|
||||
},
|
||||
actor=actor,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
if event_bus:
|
||||
await event_bus.publish(EventTopics.DOC_EXTRACTED, event_payload)
|
||||
|
||||
logger.info(
|
||||
"Field extraction completed",
|
||||
doc_id=doc_id,
|
||||
fields=len(extracted_fields),
|
||||
confidence=calibrated_confidence,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Field extraction failed", doc_id=doc_id, error=str(e))
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("extraction_errors_total").labels(
|
||||
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
def _extract_text_from_ocr(ocr_results: dict[str, Any]) -> str:
|
||||
"""Extract text from OCR results"""
|
||||
text_parts = []
|
||||
|
||||
for page in ocr_results.get("pages", []):
|
||||
if "text" in page:
|
||||
text_parts.append(page["text"])
|
||||
elif "tesseract" in page and "text" in page["tesseract"]:
|
||||
text_parts.append(page["tesseract"]["text"])
|
||||
|
||||
return "\n\n".join(text_parts)
|
||||
|
||||
|
||||
def _get_field_definitions(doc_id: str, document_text: str) -> dict[str, str]:
|
||||
"""Get field definitions based on document type"""
|
||||
|
||||
# Analyze document text to determine type
|
||||
text_lower = document_text.lower()
|
||||
|
||||
if "invoice" in text_lower or "bill" in text_lower:
|
||||
return {
|
||||
"invoice_number": "Invoice or bill number",
|
||||
"date": "Invoice date",
|
||||
"supplier_name": "Supplier or vendor name",
|
||||
"total_amount": "Total amount including VAT",
|
||||
"net_amount": "Net amount excluding VAT",
|
||||
"vat_amount": "VAT amount",
|
||||
"description": "Description of goods or services",
|
||||
}
|
||||
elif "bank statement" in text_lower or "account statement" in text_lower:
|
||||
return {
|
||||
"account_number": "Bank account number",
|
||||
"sort_code": "Bank sort code",
|
||||
"statement_period": "Statement period",
|
||||
"opening_balance": "Opening balance",
|
||||
"closing_balance": "Closing balance",
|
||||
"transactions": "List of transactions",
|
||||
}
|
||||
elif "receipt" in text_lower:
|
||||
return {
|
||||
"merchant_name": "Merchant or store name",
|
||||
"date": "Receipt date",
|
||||
"total_amount": "Total amount paid",
|
||||
"payment_method": "Payment method used",
|
||||
"items": "List of items purchased",
|
||||
}
|
||||
else:
|
||||
# Generic fields
|
||||
return {
|
||||
"date": "Any dates mentioned",
|
||||
"amount": "Any monetary amounts",
|
||||
"names": "Any person or company names",
|
||||
"addresses": "Any addresses",
|
||||
"reference_numbers": "Any reference or account numbers",
|
||||
}
|
||||
|
||||
|
||||
async def _extract_with_llm(
|
||||
document_text: str, field_definitions: dict[str, str], ocr_results: dict[str, Any]
|
||||
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
|
||||
"""Extract fields using LLM"""
|
||||
|
||||
try:
|
||||
# This would integrate with OpenAI API
|
||||
# For now, return mock extraction
|
||||
logger.warning("LLM extraction not implemented, using mock data")
|
||||
|
||||
extracted_fields = {}
|
||||
provenance = []
|
||||
|
||||
# Mock extraction based on field definitions
|
||||
for field_name, _field_desc in field_definitions.items():
|
||||
if "amount" in field_name.lower():
|
||||
extracted_fields[field_name] = "£1,234.56"
|
||||
elif "date" in field_name.lower():
|
||||
extracted_fields[field_name] = "2024-01-15"
|
||||
elif "name" in field_name.lower():
|
||||
extracted_fields[field_name] = "Example Company Ltd"
|
||||
else:
|
||||
extracted_fields[field_name] = f"Mock {field_name}"
|
||||
|
||||
# Add provenance
|
||||
provenance.append(
|
||||
{
|
||||
"field": field_name,
|
||||
"value": extracted_fields[field_name],
|
||||
"confidence": 0.8,
|
||||
"source": "llm",
|
||||
"page": 1,
|
||||
"bbox": [100, 100, 200, 120],
|
||||
}
|
||||
)
|
||||
|
||||
return extracted_fields, 0.8, provenance
|
||||
|
||||
except Exception as e:
|
||||
logger.error("LLM extraction failed", error=str(e))
|
||||
return {}, 0.0, []
|
||||
|
||||
|
||||
async def _extract_with_rules(
|
||||
document_text: str, field_definitions: dict[str, str], ocr_results: dict[str, Any]
|
||||
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
|
||||
"""Extract fields using rules-based approach"""
|
||||
|
||||
import re
|
||||
|
||||
extracted_fields = {}
|
||||
provenance = []
|
||||
|
||||
# Define extraction patterns
|
||||
patterns = {
|
||||
"amount": r"£\d{1,3}(?:,\d{3})*(?:\.\d{2})?",
|
||||
"date": r"\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b",
|
||||
"invoice_number": r"(?:invoice|inv|bill)\s*#?\s*(\w+)",
|
||||
"account_number": r"\b\d{8}\b",
|
||||
"sort_code": r"\b\d{2}-\d{2}-\d{2}\b",
|
||||
}
|
||||
|
||||
for field_name, _field_desc in field_definitions.items():
|
||||
# Find matching pattern
|
||||
pattern_key = None
|
||||
for key in patterns:
|
||||
if key in field_name.lower():
|
||||
pattern_key = key
|
||||
break
|
||||
|
||||
if pattern_key:
|
||||
pattern = patterns[pattern_key]
|
||||
matches = re.finditer(pattern, document_text, re.IGNORECASE)
|
||||
|
||||
for match in matches:
|
||||
value = match.group(1) if match.groups() else match.group(0)
|
||||
extracted_fields[field_name] = value
|
||||
|
||||
provenance.append(
|
||||
{
|
||||
"field": field_name,
|
||||
"value": value,
|
||||
"confidence": 0.9,
|
||||
"source": "rules",
|
||||
"pattern": pattern,
|
||||
"match_start": match.start(),
|
||||
"match_end": match.end(),
|
||||
}
|
||||
)
|
||||
break # Take first match
|
||||
|
||||
confidence = 0.9 if extracted_fields else 0.0
|
||||
return extracted_fields, confidence, provenance
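
# Example of what the rules-based pass picks up with the patterns above. For a field
# set containing "total_amount", "date", "invoice_number", "account_number" and
# "sort_code", and a document reading "Invoice INV-2024 total £1,234.56 paid on
# 15/01/2024 to account 12345678 (sort code 12-34-56)", the first matches would be:
#
#   total_amount   -> "£1,234.56"
#   date           -> "15/01/2024"
#   invoice_number -> "INV"   (the \w+ group stops at the hyphen; a [\w-]+ group
#                              would capture hyphenated references whole)
#   account_number -> "12345678"
#   sort_code      -> "12-34-56"
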
|
||||
|
||||
|
||||
def _merge_extractions(
|
||||
llm_fields: dict[str, Any],
|
||||
llm_conf: float,
|
||||
llm_prov: list[dict[str, Any]],
|
||||
rules_fields: dict[str, Any],
|
||||
rules_conf: float,
|
||||
rules_prov: list[dict[str, Any]],
|
||||
) -> tuple[dict[str, Any], float, list[dict[str, Any]]]:
|
||||
"""Merge LLM and rules-based extractions"""
|
||||
|
||||
merged_fields = {}
|
||||
merged_provenance = []
|
||||
|
||||
# Get all field names
|
||||
all_fields = set(llm_fields.keys()) | set(rules_fields.keys())
|
||||
|
||||
for field in all_fields:
|
||||
llm_value = llm_fields.get(field)
|
||||
rules_value = rules_fields.get(field)
|
||||
|
||||
# Prefer rules-based extraction for structured fields
|
||||
if rules_value and field in ["amount", "date", "account_number", "sort_code"]:
|
||||
merged_fields[field] = rules_value
|
||||
# Find provenance for this field
|
||||
for prov in rules_prov:
|
||||
if prov["field"] == field:
|
||||
merged_provenance.append(prov)
|
||||
break
|
||||
elif llm_value:
|
||||
merged_fields[field] = llm_value
|
||||
# Find provenance for this field
|
||||
for prov in llm_prov:
|
||||
if prov["field"] == field:
|
||||
merged_provenance.append(prov)
|
||||
break
|
||||
|
||||
# Calculate combined confidence
|
||||
combined_confidence = (llm_conf + rules_conf) / 2
|
||||
|
||||
return merged_fields, combined_confidence, merged_provenance
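
# Worked example of the merge rules above: the rules output wins for the structured
# fields, the LLM fills in everything else, and the two confidences are averaged.
#
#   llm_fields   = {"amount": "£1,200.00", "merchant_name": "Example Ltd"}   (conf 0.8)
#   rules_fields = {"amount": "£1,234.56"}                                   (conf 0.9)
#
#   merged_fields -> {
#       "amount": "£1,234.56",          # rules-based value preferred for "amount"
#       "merchant_name": "Example Ltd", # only the LLM produced this field
#   }
#   combined_confidence -> (0.8 + 0.9) / 2 = 0.85
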
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
return JSONResponse(
|
||||
status_code=exc.status_code,
|
||||
content=ErrorResponse(
|
||||
type=f"https://httpstatuses.com/{exc.status_code}",
|
||||
title=exc.detail,
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id=getattr(request.state, "trace_id", None),
|
||||
).model_dump(),
|
||||
)
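
# Illustrative RFC 7807 body this handler would produce for a 404 raised by one of
# the endpoints above (field values are examples only):
#
# {
#     "type": "https://httpstatuses.com/404",
#     "title": "Extraction not found",
#     "status": 404,
#     "detail": "Extraction not found",
#     "instance": "http://localhost:8003/extract/01HXEXAMPLE",
#     "trace_id": "4bf92f3577b34da6a3ce929d0e0e4736"
# }
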
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run("main:app", host="0.0.0.0", port=8003, reload=True, log_config=None)

apps/svc_extract/requirements.txt (Normal file, 17 lines)
@@ -0,0 +1,17 @@

# Service-specific dependencies for svc_extract
# LLM integration
openai>=1.3.0
anthropic>=0.7.0

# JSON schema validation
jsonschema>=4.20.0

# Template processing
jinja2>=3.1.0

# Text similarity (lightweight)
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0

# Data validation
cerberus>=1.3.4

apps/svc_firm_connectors/Dockerfile (Normal file, 53 lines)
@@ -0,0 +1,53 @@

# Multi-stage build for svc_firm_connectors
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create virtual environment
|
||||
RUN python -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY apps/svc_firm_connectors/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& groupadd -r appuser \
|
||||
&& useradd -r -g appuser appuser
|
||||
|
||||
# Copy virtual environment from builder
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_firm_connectors/ ./apps/svc_firm_connectors/
|
||||
|
||||
# Create non-root user and set permissions
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the application
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_firm_connectors.main:app", "--host", "0.0.0.0", "--port", "8000"]

apps/svc_firm_connectors/main.py (Normal file, 762 lines)
@@ -0,0 +1,762 @@

# FILE: apps/svc-firm-connectors/main.py
|
||||
# mypy: disable-error-code=union-attr
|
||||
# Firm database integration with practice management systems
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import (
|
||||
BaseAppSettings,
|
||||
create_event_bus,
|
||||
create_neo4j_client,
|
||||
create_vault_client,
|
||||
)
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.neo import Neo4jClient
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse, FirmSyncRequest, FirmSyncResponse
|
||||
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class FirmConnectorsSettings(BaseAppSettings):
|
||||
"""Settings for firm connectors service"""
|
||||
|
||||
service_name: str = "svc-firm-connectors"
|
||||
|
||||
# Supported practice management systems
|
||||
supported_systems: list[str] = [
|
||||
"iris",
|
||||
"sage",
|
||||
"xero",
|
||||
"quickbooks",
|
||||
"freeagent",
|
||||
"kashflow",
|
||||
]
|
||||
|
||||
# Sync configuration
|
||||
sync_batch_size: int = 100
|
||||
max_sync_retries: int = 3
|
||||
sync_timeout: int = 300 # 5 minutes
|
||||
|
||||
# Rate limiting
|
||||
api_rate_limit: int = 100 # requests per minute
|
||||
|
||||
# Data mapping
|
||||
field_mappings_dir: str = "config/firm_mappings"
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-firm-connectors",
|
||||
title="Tax Agent Firm Connectors Service",
|
||||
description="Practice management system integration",
|
||||
settings_class=FirmConnectorsSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
vault_helper: VaultTransitHelper | None = None
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-firm-connectors")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global vault_helper, neo4j_client, event_bus
|
||||
|
||||
logger.info("Starting firm connectors service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Vault helper
|
||||
vault_client = create_vault_client(settings)
|
||||
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
logger.info("Firm connectors service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
|
||||
logger.info("Shutting down firm connectors service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("Firm connectors service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"supported_systems": settings.supported_systems,
|
||||
}
|
||||
|
||||
|
||||
@app.post("/sync", response_model=FirmSyncResponse)
|
||||
async def sync_firm_data(
|
||||
request_data: FirmSyncRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> FirmSyncResponse:
|
||||
"""Sync data from practice management system"""
|
||||
|
||||
with tracer.start_as_current_span("sync_firm_data") as span:
|
||||
span.set_attribute("system", request_data.system)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("sync_type", request_data.sync_type)
|
||||
|
||||
try:
|
||||
# Validate system
|
||||
if request_data.system not in settings.supported_systems:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Unsupported system: {request_data.system}"
|
||||
)
|
||||
|
||||
# Generate sync ID
|
||||
sync_id = str(ulid.new())
|
||||
span.set_attribute("sync_id", sync_id)
|
||||
|
||||
# Start background sync
|
||||
background_tasks.add_task(
|
||||
_sync_firm_data_async,
|
||||
request_data.system,
|
||||
request_data.sync_type,
|
||||
request_data.connection_config,
|
||||
tenant_id,
|
||||
sync_id,
|
||||
current_user.get("sub", "system"),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Firm data sync started",
|
||||
sync_id=sync_id,
|
||||
system=request_data.system,
|
||||
sync_type=request_data.sync_type,
|
||||
)
|
||||
|
||||
return FirmSyncResponse(
|
||||
firm_id=request_data.firm_id,
|
||||
status="syncing",
|
||||
message=f"Sync started with ID: {sync_id}",
|
||||
synced_entities=0,
|
||||
errors=[],
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to start firm sync", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to start firm sync")
|
||||
|
||||
|
||||
@app.get("/sync/{sync_id}")
|
||||
async def get_sync_status(
|
||||
sync_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get sync status"""
|
||||
|
||||
with tracer.start_as_current_span("get_sync_status") as span:
|
||||
span.set_attribute("sync_id", sync_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get sync record from Neo4j
|
||||
query = """
|
||||
MATCH (s:FirmSync {sync_id: $sync_id, tenant_id: $tenant_id})
|
||||
WHERE s.retracted_at IS NULL
|
||||
RETURN s
|
||||
"""
|
||||
|
||||
results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
|
||||
query, {"sync_id": sync_id, "tenant_id": tenant_id}
|
||||
)
|
||||
|
||||
if not results:
|
||||
raise HTTPException(status_code=404, detail="Sync not found")
|
||||
|
||||
sync_record = results[0]["s"]
|
||||
|
||||
return {
|
||||
"sync_id": sync_id,
|
||||
"system": sync_record.get("system"),
|
||||
"status": sync_record.get("status"),
|
||||
"records_synced": sync_record.get("records_synced", 0),
|
||||
"total_records": sync_record.get("total_records", 0),
|
||||
"started_at": sync_record.get("started_at"),
|
||||
"completed_at": sync_record.get("completed_at"),
|
||||
"errors": json.loads(sync_record.get("errors", "[]")),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to get sync status", sync_id=sync_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to get sync status")
|
||||
|
||||
|
||||
@app.post("/connections/{system}/test")
|
||||
async def test_connection(
|
||||
system: str,
|
||||
connection_config: dict[str, Any],
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Test connection to practice management system"""
|
||||
|
||||
with tracer.start_as_current_span("test_connection") as span:
|
||||
span.set_attribute("system", system)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Validate system
|
||||
if system not in settings.supported_systems:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Unsupported system: {system}"
|
||||
)
|
||||
|
||||
# Test connection based on system
|
||||
if system == "iris":
|
||||
result = await _test_iris_connection(connection_config)
|
||||
elif system == "sage":
|
||||
result = await _test_sage_connection(connection_config)
|
||||
elif system == "xero":
|
||||
result = await _test_xero_connection(connection_config)
|
||||
elif system == "quickbooks":
|
||||
result = await _test_quickbooks_connection(connection_config)
|
||||
elif system == "freeagent":
|
||||
result = await _test_freeagent_connection(connection_config)
|
||||
elif system == "kashflow":
|
||||
result = await _test_kashflow_connection(connection_config)
|
||||
else:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Connection test not implemented for {system}",
|
||||
)
|
||||
|
||||
return {
|
||||
"system": system,
|
||||
"connection_status": result["status"],
|
||||
"message": result["message"],
|
||||
"capabilities": result.get("capabilities", []),
|
||||
"test_timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Connection test failed", system=system, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Connection test failed: {str(e)}"
|
||||
)
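
# Optional alternative to the if/elif dispatch above, shown only as a sketch of the
# same behaviour (not a required refactor): a table keyed by system name keeps the
# endpoint body constant as more systems are added. Left commented out because the
# _test_*_connection helpers are defined further down in this module.
#
# _CONNECTION_TESTERS = {
#     "iris": _test_iris_connection,
#     "sage": _test_sage_connection,
#     "xero": _test_xero_connection,
#     "quickbooks": _test_quickbooks_connection,
#     "freeagent": _test_freeagent_connection,
#     "kashflow": _test_kashflow_connection,
# }
#
# tester = _CONNECTION_TESTERS.get(system)
# if tester is None:
#     raise HTTPException(
#         status_code=400, detail=f"Connection test not implemented for {system}"
#     )
# result = await tester(connection_config)
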
|
||||
|
||||
|
||||
@app.get("/systems")
|
||||
async def list_supported_systems(
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""List supported practice management systems"""
|
||||
|
||||
try:
|
||||
systems_info: list[Any] = []
|
||||
|
||||
for system in settings.supported_systems:
|
||||
system_info = {
|
||||
"system": system,
|
||||
"name": _get_system_name(system),
|
||||
"capabilities": _get_system_capabilities(system),
|
||||
"connection_fields": _get_connection_fields(system),
|
||||
}
|
||||
systems_info.append(system_info)
|
||||
|
||||
return {"supported_systems": systems_info, "total_systems": len(systems_info)}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to list systems", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to list systems")
|
||||
|
||||
|
||||
async def _sync_firm_data_async(
|
||||
system: str,
|
||||
sync_type: str,
|
||||
connection_config: dict[str, Any],
|
||||
tenant_id: str,
|
||||
sync_id: str,
|
||||
actor: str,
|
||||
) -> None:
|
||||
"""Sync firm data asynchronously"""
|
||||
|
||||
with tracer.start_as_current_span("sync_firm_data_async") as span:
|
||||
span.set_attribute("sync_id", sync_id)
|
||||
span.set_attribute("system", system)
|
||||
span.set_attribute("sync_type", sync_type)
|
||||
|
||||
try:
|
||||
# Create sync record
|
||||
await _create_sync_record(sync_id, system, sync_type, tenant_id)
|
||||
|
||||
# Perform sync based on system
|
||||
if system == "iris":
|
||||
sync_result = await _sync_iris_data(
|
||||
connection_config, sync_type, tenant_id
|
||||
)
|
||||
elif system == "sage":
|
||||
sync_result = await _sync_sage_data(
|
||||
connection_config, sync_type, tenant_id
|
||||
)
|
||||
elif system == "xero":
|
||||
sync_result = await _sync_xero_data(
|
||||
connection_config, sync_type, tenant_id
|
||||
)
|
||||
elif system == "quickbooks":
|
||||
sync_result = await _sync_quickbooks_data(
|
||||
connection_config, sync_type, tenant_id
|
||||
)
|
||||
elif system == "freeagent":
|
||||
sync_result = await _sync_freeagent_data(
|
||||
connection_config, sync_type, tenant_id
|
||||
)
|
||||
elif system == "kashflow":
|
||||
sync_result = await _sync_kashflow_data(
|
||||
connection_config, sync_type, tenant_id
|
||||
)
|
||||
else:
|
||||
raise Exception(f"Sync not implemented for {system}")
|
||||
|
||||
# Update sync record
|
||||
await _update_sync_record(sync_id, "completed", sync_result)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("firm_syncs_completed_total").labels(
|
||||
tenant_id=tenant_id, system=system, sync_type=sync_type
|
||||
).inc()
|
||||
|
||||
metrics.histogram("sync_records_count").labels(
|
||||
system=system, sync_type=sync_type
|
||||
).observe(sync_result["records_synced"])
|
||||
|
||||
# Publish completion event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"sync_id": sync_id,
|
||||
"system": system,
|
||||
"sync_type": sync_type,
|
||||
"tenant_id": tenant_id,
|
||||
"records_synced": sync_result["records_synced"],
|
||||
"entities_created": sync_result.get("entities_created", 0),
|
||||
},
|
||||
actor=actor,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.FIRM_SYNC_COMPLETED, event_payload) # type: ignore
|
||||
|
||||
logger.info(
|
||||
"Firm sync completed",
|
||||
sync_id=sync_id,
|
||||
system=system,
|
||||
records=sync_result["records_synced"],
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Firm sync failed", sync_id=sync_id, error=str(e))
|
||||
|
||||
# Update sync record with error
|
||||
await _update_sync_record(sync_id, "error", {"error": str(e)})
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("firm_sync_errors_total").labels(
|
||||
tenant_id=tenant_id, system=system, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
async def _test_iris_connection(config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Test IRIS connection"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(1)
|
||||
return {
|
||||
"status": "success",
|
||||
"message": "Connection successful",
|
||||
"capabilities": ["clients", "jobs", "documents"],
|
||||
}
|
||||
|
||||
|
||||
async def _test_sage_connection(config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Test Sage connection"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(1)
|
||||
return {
|
||||
"status": "success",
|
||||
"message": "Connection successful",
|
||||
"capabilities": ["customers", "suppliers", "transactions"],
|
||||
}
|
||||
|
||||
|
||||
async def _test_xero_connection(config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Test Xero connection"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(1)
|
||||
return {
|
||||
"status": "success",
|
||||
"message": "Connection successful",
|
||||
"capabilities": ["contacts", "invoices", "bank_transactions"],
|
||||
}
|
||||
|
||||
|
||||
async def _test_quickbooks_connection(config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Test QuickBooks connection"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(1)
|
||||
return {
|
||||
"status": "success",
|
||||
"message": "Connection successful",
|
||||
"capabilities": ["customers", "vendors", "items", "transactions"],
|
||||
}
|
||||
|
||||
|
||||
async def _test_freeagent_connection(config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Test FreeAgent connection"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(1)
|
||||
return {
|
||||
"status": "success",
|
||||
"message": "Connection successful",
|
||||
"capabilities": ["contacts", "projects", "invoices", "expenses"],
|
||||
}
|
||||
|
||||
|
||||
async def _test_kashflow_connection(config: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Test KashFlow connection"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(1)
|
||||
return {
|
||||
"status": "success",
|
||||
"message": "Connection successful",
|
||||
"capabilities": ["customers", "suppliers", "invoices", "receipts"],
|
||||
}
|
||||
|
||||
|
||||
async def _sync_iris_data(
|
||||
config: dict[str, Any], sync_type: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Sync data from IRIS"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(2)
|
||||
|
||||
# Simulate syncing client data
|
||||
mock_clients = [
|
||||
{"id": "client_1", "name": "John Doe", "utr": "1234567890"},
|
||||
{"id": "client_2", "name": "Jane Smith", "utr": "0987654321"},
|
||||
]
|
||||
|
||||
entities_created = 0
|
||||
for client in mock_clients:
|
||||
# Create taxpayer profile in KG
|
||||
taxpayer_properties = {
|
||||
"taxpayer_id": client["id"],
|
||||
"name": client["name"],
|
||||
"utr": client["utr"],
|
||||
"tenant_id": tenant_id,
|
||||
"source": "iris_sync",
|
||||
"extractor_version": "1.0.0",
|
||||
"valid_from": datetime.utcnow(),
|
||||
"asserted_at": datetime.utcnow(),
|
||||
}
|
||||
|
||||
await neo4j_client.create_node("TaxpayerProfile", taxpayer_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
entities_created += 1
|
||||
|
||||
return {
|
||||
"records_synced": len(mock_clients),
|
||||
"entities_created": entities_created,
|
||||
"sync_type": sync_type,
|
||||
}
|
||||
|
||||
|
||||
async def _sync_sage_data(
|
||||
config: dict[str, Any], sync_type: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Sync data from Sage"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(2)
|
||||
return {"records_synced": 5, "entities_created": 5, "sync_type": sync_type}
|
||||
|
||||
|
||||
async def _sync_xero_data(
|
||||
config: dict[str, Any], sync_type: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Sync data from Xero"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(2)
|
||||
return {"records_synced": 8, "entities_created": 8, "sync_type": sync_type}
|
||||
|
||||
|
||||
async def _sync_quickbooks_data(
|
||||
config: dict[str, Any], sync_type: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Sync data from QuickBooks"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(2)
|
||||
return {"records_synced": 12, "entities_created": 12, "sync_type": sync_type}
|
||||
|
||||
|
||||
async def _sync_freeagent_data(
|
||||
config: dict[str, Any], sync_type: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Sync data from FreeAgent"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(2)
|
||||
return {"records_synced": 6, "entities_created": 6, "sync_type": sync_type}
|
||||
|
||||
|
||||
async def _sync_kashflow_data(
|
||||
config: dict[str, Any], sync_type: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Sync data from KashFlow"""
|
||||
# Mock implementation
|
||||
await asyncio.sleep(2)
|
||||
return {"records_synced": 4, "entities_created": 4, "sync_type": sync_type}
|
||||
|
||||
|
||||
def _get_system_name(system: str) -> str:
|
||||
"""Get human-readable system name"""
|
||||
names = {
|
||||
"iris": "IRIS Practice Management",
|
||||
"sage": "Sage Practice Management",
|
||||
"xero": "Xero",
|
||||
"quickbooks": "QuickBooks",
|
||||
"freeagent": "FreeAgent",
|
||||
"kashflow": "KashFlow",
|
||||
}
|
||||
return names.get(system, system.title())
|
||||
|
||||
|
||||
def _get_system_capabilities(system: str) -> list[str]:
|
||||
"""Get system capabilities"""
|
||||
capabilities = {
|
||||
"iris": ["clients", "jobs", "documents", "time_tracking"],
|
||||
"sage": ["customers", "suppliers", "transactions", "reports"],
|
||||
"xero": ["contacts", "invoices", "bank_transactions", "reports"],
|
||||
"quickbooks": ["customers", "vendors", "items", "transactions", "reports"],
|
||||
"freeagent": ["contacts", "projects", "invoices", "expenses", "time_tracking"],
|
||||
"kashflow": ["customers", "suppliers", "invoices", "receipts", "reports"],
|
||||
}
|
||||
return capabilities.get(system, [])
|
||||
|
||||
|
||||
def _get_connection_fields(system: str) -> list[dict[str, Any]]:
|
||||
"""Get required connection fields for system"""
|
||||
fields = {
|
||||
"iris": [
|
||||
{
|
||||
"name": "api_key",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "IRIS API Key",
|
||||
},
|
||||
{
|
||||
"name": "base_url",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "IRIS Base URL",
|
||||
},
|
||||
],
|
||||
"sage": [
|
||||
{
|
||||
"name": "username",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "Sage Username",
|
||||
},
|
||||
{
|
||||
"name": "password",
|
||||
"type": "password",
|
||||
"required": True,
|
||||
"description": "Sage Password",
|
||||
},
|
||||
{
|
||||
"name": "database",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "Database Name",
|
||||
},
|
||||
],
|
||||
"xero": [
|
||||
{
|
||||
"name": "client_id",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "Xero Client ID",
|
||||
},
|
||||
{
|
||||
"name": "client_secret",
|
||||
"type": "password",
|
||||
"required": True,
|
||||
"description": "Xero Client Secret",
|
||||
},
|
||||
{
|
||||
"name": "tenant_id",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "Xero Tenant ID",
|
||||
},
|
||||
],
|
||||
"quickbooks": [
|
||||
{
|
||||
"name": "client_id",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "QuickBooks Client ID",
|
||||
},
|
||||
{
|
||||
"name": "client_secret",
|
||||
"type": "password",
|
||||
"required": True,
|
||||
"description": "QuickBooks Client Secret",
|
||||
},
|
||||
{
|
||||
"name": "company_id",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "Company ID",
|
||||
},
|
||||
],
|
||||
"freeagent": [
|
||||
{
|
||||
"name": "client_id",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "FreeAgent Client ID",
|
||||
},
|
||||
{
|
||||
"name": "client_secret",
|
||||
"type": "password",
|
||||
"required": True,
|
||||
"description": "FreeAgent Client Secret",
|
||||
},
|
||||
],
|
||||
"kashflow": [
|
||||
{
|
||||
"name": "username",
|
||||
"type": "string",
|
||||
"required": True,
|
||||
"description": "KashFlow Username",
|
||||
},
|
||||
{
|
||||
"name": "password",
|
||||
"type": "password",
|
||||
"required": True,
|
||||
"description": "KashFlow Password",
|
||||
},
|
||||
],
|
||||
}
|
||||
return fields.get(system, [])
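
# Illustrative (unused) helper: the field definitions above can double as a
# lightweight validator for incoming connection_config payloads. The helper name is
# hypothetical and nothing calls it yet.
def _validate_connection_config_sketch(
    system: str, connection_config: dict[str, Any]
) -> list[str]:
    """Return human-readable problems; an empty list means the config looks complete."""
    problems: list[str] = []
    for field in _get_connection_fields(system):
        name = field["name"]
        if field.get("required") and not connection_config.get(name):
            problems.append(f"Missing required field for {system}: {name}")
    return problems
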
|
||||
|
||||
|
||||
async def _create_sync_record(
|
||||
sync_id: str, system: str, sync_type: str, tenant_id: str
|
||||
) -> None:
|
||||
"""Create sync record in knowledge graph"""
|
||||
|
||||
sync_properties = {
|
||||
"sync_id": sync_id,
|
||||
"system": system,
|
||||
"sync_type": sync_type,
|
||||
"tenant_id": tenant_id,
|
||||
"status": "running",
|
||||
"started_at": datetime.utcnow().isoformat(),
|
||||
"records_synced": 0,
|
||||
"errors": "[]",
|
||||
"source": "firm_connectors",
|
||||
"extractor_version": "1.0.0",
|
||||
"valid_from": datetime.utcnow(),
|
||||
"asserted_at": datetime.utcnow(),
|
||||
}
|
||||
|
||||
await neo4j_client.create_node("FirmSync", sync_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
|
||||
async def _update_sync_record(
|
||||
sync_id: str, status: str, result: dict[str, Any]
|
||||
) -> None:
|
||||
"""Update sync record with results"""
|
||||
|
||||
update_properties = {
|
||||
"status": status,
|
||||
"completed_at": datetime.utcnow().isoformat(),
|
||||
"records_synced": result.get("records_synced", 0),
|
||||
"total_records": result.get("total_records", 0),
|
||||
"errors": json.dumps(result.get("errors", [])),
|
||||
}
|
||||
|
||||
# This would update the existing node
|
||||
# For now, just log
|
||||
logger.debug(
|
||||
"Sync record updated",
|
||||
sync_id=sync_id,
|
||||
status=status,
|
||||
properties=update_properties,
|
||||
)
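
# Sketch of the Cypher the real update could run, assuming Neo4jClient.run_query
# accepts parameterised write statements the same way the read queries above do
# (a production version would likely also filter on tenant_id):
#
# update_query = """
# MATCH (s:FirmSync {sync_id: $sync_id})
# WHERE s.retracted_at IS NULL
# SET s.status = $status,
#     s.completed_at = $completed_at,
#     s.records_synced = $records_synced,
#     s.total_records = $total_records,
#     s.errors = $errors
# RETURN s
# """
# await neo4j_client.run_query(update_query, {"sync_id": sync_id, **update_properties})
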
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
return JSONResponse(
|
||||
status_code=exc.status_code,
|
||||
content=ErrorResponse(
|
||||
type=f"https://httpstatuses.com/{exc.status_code}",
|
||||
title=exc.detail,
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id="",
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run("main:app", host="0.0.0.0", port=8011, reload=True, log_config=None)

apps/svc_firm_connectors/requirements.txt (Normal file, 45 lines)
@@ -0,0 +1,45 @@

# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0

# Service-specific dependencies
# Database connectors
sqlalchemy>=2.0.0
pymssql>=2.2.0
cx-Oracle>=8.3.0

# API clients for practice management systems
zeep>=4.2.0  # SOAP client
xmltodict>=0.13.0

# OAuth for various systems
authlib>=1.2.0
requests-oauthlib>=1.3.0

# Data synchronization
pandas>=2.1.0

# Rate limiting
ratelimit>=2.2.0

# Retry mechanisms
tenacity>=8.2.0

# CSV processing
csvkit>=1.1.0

# Excel file processing
openpyxl>=3.1.0
xlrd>=2.0.0

# Data validation
marshmallow>=3.20.0
cerberus>=1.3.4

# Connection pooling (built into SQLAlchemy)
# sqlalchemy-pool>=1.3.0  # Package doesn't exist, pooling is built into SQLAlchemy

# Additional utilities
python-dateutil>=2.8.0
pytz>=2023.3

apps/svc_forms/Dockerfile (Normal file, 53 lines)
@@ -0,0 +1,53 @@

# Multi-stage build for svc_forms
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create virtual environment
|
||||
RUN python -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY apps/svc_forms/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& groupadd -r appuser \
|
||||
&& useradd -r -g appuser appuser
|
||||
|
||||
# Copy virtual environment from builder
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_forms/ ./apps/svc_forms/
|
||||
|
||||
# Create non-root user and set permissions
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the application
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_forms.main:app", "--host", "0.0.0.0", "--port", "8000"]

apps/svc_forms/main.py (Normal file, 625 lines)
@@ -0,0 +1,625 @@

"""PDF form filling with evidence pack generation."""
|
||||
|
||||
# FILE: apps/svc-forms/main.py
|
||||
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
|
||||
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
|
||||
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
|
||||
# pylint: disable=too-many-locals,import-outside-toplevel
|
||||
# mypy: disable-error-code=union-attr
|
||||
|
||||
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from io import BytesIO
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse, Response
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import (
|
||||
BaseAppSettings,
|
||||
create_event_bus,
|
||||
create_minio_client,
|
||||
create_neo4j_client,
|
||||
)
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.forms import UK_TAX_FORMS, EvidencePackGenerator, PDFFormFiller
|
||||
from libs.neo import Neo4jClient
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
from libs.storage import DocumentStorage, StorageClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class FormsSettings(BaseAppSettings):
|
||||
"""Settings for forms service"""
|
||||
|
||||
service_name: str = "svc-forms"
|
||||
|
||||
# Form templates
|
||||
forms_template_dir: str = "forms/templates"
|
||||
output_bucket: str = "filled-forms"
|
||||
evidence_packs_bucket: str = "evidence-packs"
|
||||
|
||||
# Supported forms
|
||||
supported_forms: list[str] = ["SA100", "SA103", "SA105", "SA106"]
|
||||
|
||||
# PDF configuration
|
||||
pdf_quality: str = "high"
|
||||
flatten_forms: bool = True
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-forms",
|
||||
title="Tax Agent Forms Service",
|
||||
description="PDF form filling and evidence pack generation",
|
||||
settings_class=FormsSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
storage_client: StorageClient | None = None
|
||||
document_storage: DocumentStorage | None = None
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
pdf_form_filler: PDFFormFiller | None = None
|
||||
evidence_pack_generator: EvidencePackGenerator | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-forms")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global storage_client, document_storage, neo4j_client, pdf_form_filler # pylint: disable=line-too-long
|
||||
global evidence_pack_generator, event_bus
|
||||
|
||||
logger.info("Starting forms service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize MinIO client
|
||||
minio_client = create_minio_client(settings)
|
||||
storage_client = StorageClient(minio_client)
|
||||
document_storage = DocumentStorage(storage_client)
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize PDF form filler
|
||||
pdf_form_filler = PDFFormFiller()
|
||||
|
||||
# Load form templates
|
||||
for form_id in settings.supported_forms:
|
||||
template_path = os.path.join(settings.forms_template_dir, f"{form_id}.pdf")
|
||||
if os.path.exists(template_path):
|
||||
pdf_form_filler.load_template(form_id, template_path)
|
||||
else:
|
||||
logger.warning(
|
||||
"Form template not found", form_id=form_id, path=template_path
|
||||
)
|
||||
|
||||
# Initialize evidence pack generator
|
||||
evidence_pack_generator = EvidencePackGenerator(storage_client)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
# Subscribe to calculation completion events
|
||||
await event_bus.subscribe( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
EventTopics.CALC_SCHEDULE_READY, _handle_calculation_ready
|
||||
)
|
||||
|
||||
# Ensure buckets exist
|
||||
await storage_client.ensure_bucket(settings.output_bucket)
|
||||
await storage_client.ensure_bucket(settings.evidence_packs_bucket)
|
||||
|
||||
logger.info("Forms service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
|
||||
logger.info("Shutting down forms service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("Forms service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": "1.0.0",
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
"supported_forms": settings.supported_forms,
|
||||
}
|
||||
|
||||
|
||||
@app.post("/fill/{form_id}")
|
||||
async def fill_form(
|
||||
form_id: str,
|
||||
field_values: dict[str, Any],
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Fill PDF form with provided values"""
|
||||
|
||||
with tracer.start_as_current_span("fill_form") as span:
|
||||
span.set_attribute("form_id", form_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("field_count", len(field_values))
|
||||
|
||||
try:
|
||||
# Validate form ID
|
||||
if form_id not in settings.supported_forms:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Unsupported form: {form_id}"
|
||||
)
|
||||
|
||||
# Generate filling ID
|
||||
filling_id = str(ulid.new())
|
||||
span.set_attribute("filling_id", filling_id)
|
||||
|
||||
# Start background form filling
|
||||
background_tasks.add_task(
|
||||
_fill_form_async,
|
||||
form_id,
|
||||
field_values,
|
||||
tenant_id,
|
||||
filling_id,
|
||||
current_user.get("sub", "system"),
|
||||
)
|
||||
|
||||
logger.info("Form filling started", form_id=form_id, filling_id=filling_id)
|
||||
|
||||
return {
|
||||
"filling_id": filling_id,
|
||||
"form_id": form_id,
|
||||
"status": "filling",
|
||||
"field_count": len(field_values),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to start form filling", form_id=form_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to start form filling")
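
# Example client call against this endpoint (illustrative values; the auth and
# tenant header names are assumptions based on get_current_user / get_tenant_id,
# not confirmed from libs.security):
#
# import httpx
#
# resp = httpx.post(
#     "http://localhost:8009/fill/SA103",
#     json={"box_1": "Jane Smith", "box_9": "12500.00"},
#     headers={"Authorization": "Bearer <token>", "X-Tenant-ID": "tenant-123"},
# )
# resp.raise_for_status()
# # -> {"filling_id": "01HX...", "form_id": "SA103", "status": "filling", "field_count": 2}
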
|
||||
|
||||
|
||||
@app.post("/fill-from-calculation/{calculation_id}")
|
||||
async def fill_form_from_calculation(
|
||||
calculation_id: str,
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Fill form using calculation results"""
|
||||
|
||||
with tracer.start_as_current_span("fill_form_from_calculation") as span:
|
||||
span.set_attribute("calculation_id", calculation_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get calculation from Neo4j
|
||||
calc_query = """
|
||||
MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})
|
||||
WHERE c.retracted_at IS NULL
|
||||
RETURN c
|
||||
"""
|
||||
|
||||
calc_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
|
||||
calc_query, {"calculation_id": calculation_id, "tenant_id": tenant_id}
|
||||
)
|
||||
|
||||
if not calc_results:
|
||||
raise HTTPException(status_code=404, detail="Calculation not found")
|
||||
|
||||
calculation = calc_results[0]["c"]
|
||||
form_id = calculation.get("schedule")
|
||||
|
||||
if not form_id:
|
||||
raise HTTPException(
|
||||
status_code=400, detail="No schedule found in calculation"
|
||||
)
|
||||
|
||||
# Get form boxes
|
||||
boxes_query = """
|
||||
MATCH (c:Calculation {calculation_id: $calculation_id})-[:HAS_BOX]->(b:FormBox)
|
||||
WHERE c.retracted_at IS NULL AND b.retracted_at IS NULL
|
||||
RETURN b
|
||||
"""
|
||||
|
||||
box_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
|
||||
boxes_query, {"calculation_id": calculation_id}
|
||||
)
|
||||
|
||||
# Convert form boxes to field values
|
||||
field_values = {}
|
||||
for box_result in box_results:
|
||||
box = box_result["b"]
|
||||
field_values[f"box_{box['box']}"] = box["value"]
|
||||
|
||||
# Generate filling ID
|
||||
filling_id = str(ulid.new())
|
||||
span.set_attribute("filling_id", filling_id)
|
||||
span.set_attribute("form_id", form_id)
|
||||
|
||||
# Start background form filling
|
||||
background_tasks.add_task(
|
||||
_fill_form_async,
|
||||
form_id,
|
||||
field_values,
|
||||
tenant_id,
|
||||
filling_id,
|
||||
current_user.get("sub", "system"),
|
||||
calculation_id,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Form filling from calculation started",
|
||||
form_id=form_id,
|
||||
filling_id=filling_id,
|
||||
calculation_id=calculation_id,
|
||||
)
|
||||
|
||||
return {
|
||||
"filling_id": filling_id,
|
||||
"form_id": form_id,
|
||||
"calculation_id": calculation_id,
|
||||
"status": "filling",
|
||||
"field_count": len(field_values),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to fill form from calculation",
|
||||
calculation_id=calculation_id,
|
||||
error=str(e),
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to fill form from calculation"
|
||||
)
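
# Example of the FormBox -> field_values mapping performed above (box ids and
# values are illustrative):
#
#   (:FormBox {box: "3.29", value: "12500.00"})
#   (:FormBox {box: "3.30", value: "340.00"})
#
# becomes
#
#   field_values = {"box_3.29": "12500.00", "box_3.30": "340.00"}
#
# i.e. the loaded PDF templates are expected to expose field names following the
# "box_<box id>" convention.
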
|
||||
|
||||
|
||||
@app.get("/download/{filling_id}")
|
||||
async def download_filled_form(
|
||||
filling_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> Response:
|
||||
"""Download filled form"""
|
||||
|
||||
with tracer.start_as_current_span("download_filled_form") as span:
|
||||
span.set_attribute("filling_id", filling_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get filled form from storage
|
||||
object_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
|
||||
|
||||
form_content = await storage_client.get_object( # pyright: ignore[reportOptionalMemberAccess]
|
||||
settings.output_bucket, object_key
|
||||
)
|
||||
|
||||
if not form_content:
|
||||
raise HTTPException(status_code=404, detail="Filled form not found")
|
||||
|
||||
return Response(
|
||||
content=form_content,
|
||||
media_type="application/pdf",
|
||||
headers={
|
||||
"Content-Disposition": f"attachment; filename={filling_id}.pdf"
|
||||
},
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to download filled form", filling_id=filling_id, error=str(e)
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to download filled form"
|
||||
)
|
||||
|
||||
|
||||
@app.post("/evidence-pack")
|
||||
async def create_evidence_pack(
|
||||
taxpayer_id: str,
|
||||
tax_year: str,
|
||||
scope: str,
|
||||
evidence_items: list[dict[str, Any]],
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Create evidence pack with supporting documents"""
|
||||
|
||||
with tracer.start_as_current_span("create_evidence_pack") as span:
|
||||
span.set_attribute("taxpayer_id", taxpayer_id)
|
||||
span.set_attribute("tax_year", tax_year)
|
||||
span.set_attribute("scope", scope)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("evidence_count", len(evidence_items))
|
||||
|
||||
try:
|
||||
# Generate pack ID
|
||||
pack_id = str(ulid.new())
|
||||
span.set_attribute("pack_id", pack_id)
|
||||
|
||||
# Start background pack creation
|
||||
background_tasks.add_task(
|
||||
_create_evidence_pack_async,
|
||||
taxpayer_id,
|
||||
tax_year,
|
||||
scope,
|
||||
evidence_items,
|
||||
tenant_id,
|
||||
pack_id,
|
||||
current_user.get("sub", "system"),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Evidence pack creation started",
|
||||
pack_id=pack_id,
|
||||
taxpayer_id=taxpayer_id,
|
||||
scope=scope,
|
||||
)
|
||||
|
||||
return {
|
||||
"pack_id": pack_id,
|
||||
"taxpayer_id": taxpayer_id,
|
||||
"tax_year": tax_year,
|
||||
"scope": scope,
|
||||
"status": "creating",
|
||||
"evidence_count": len(evidence_items),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to start evidence pack creation", error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to start evidence pack creation"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/forms")
|
||||
async def list_supported_forms(
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""List supported forms with field information"""
|
||||
|
||||
try:
|
||||
forms_info = []
|
||||
|
||||
for form_id in settings.supported_forms:
|
||||
form_config = UK_TAX_FORMS.get(form_id, {})
|
||||
|
||||
# Get form fields if template is loaded
|
||||
fields = []
|
||||
if pdf_form_filler and form_id in pdf_form_filler.form_templates:
|
||||
fields = pdf_form_filler.get_form_fields(form_id)
|
||||
|
||||
forms_info.append(
|
||||
{
|
||||
"form_id": form_id,
|
||||
"name": form_config.get("name", form_id),
|
||||
"template_available": form_id
|
||||
in (pdf_form_filler.form_templates if pdf_form_filler else {}),
|
||||
"field_count": len(fields),
|
||||
"fields": fields[:10], # Limit to first 10 fields for overview
|
||||
}
|
||||
)
|
||||
|
||||
return {"supported_forms": forms_info, "total_forms": len(forms_info)}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to list forms", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to list forms")
|
||||
|
||||
|
||||
async def _handle_calculation_ready(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle calculation completion events for auto-form filling"""
|
||||
try:
|
||||
data = payload.data
|
||||
calculation_id = data.get("calculation_id")
|
||||
schedule = data.get("schedule")
|
||||
tenant_id = data.get("tenant_id")
|
||||
|
||||
if not calculation_id or not schedule or not tenant_id:
|
||||
logger.warning("Invalid calculation ready event", data=data)
|
||||
return
|
||||
|
||||
logger.info(
|
||||
"Auto-filling form from calculation",
|
||||
calculation_id=calculation_id,
|
||||
schedule=schedule,
|
||||
)
|
||||
|
||||
# Get form boxes from event data
|
||||
form_boxes = data.get("form_boxes", {})
|
||||
|
||||
# Convert to field values
|
||||
field_values = {}
|
||||
for box_id, box_data in form_boxes.items():
|
||||
field_values[f"box_{box_id}"] = box_data.get("value")
|
||||
|
||||
await _fill_form_async(
|
||||
form_id=schedule,
|
||||
field_values=field_values,
|
||||
tenant_id=tenant_id,
|
||||
filling_id=str(ulid.new()),
|
||||
actor=payload.actor,
|
||||
calculation_id=calculation_id,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to handle calculation ready event", error=str(e))
|
||||
|
||||
|
||||
async def _fill_form_async(
|
||||
form_id: str,
|
||||
field_values: dict[str, Any],
|
||||
tenant_id: str,
|
||||
filling_id: str,
|
||||
actor: str,
|
||||
calculation_id: str | None = None,
|
||||
) -> None:
|
||||
"""Fill form asynchronously"""
|
||||
|
||||
with tracer.start_as_current_span("fill_form_async") as span:
|
||||
span.set_attribute("form_id", form_id)
|
||||
span.set_attribute("filling_id", filling_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Fill the form
|
||||
filled_pdf = pdf_form_filler.fill_form(form_id, field_values) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
if not filled_pdf:
|
||||
# pylint: disable-next=broad-exception-raised
|
||||
raise Exception("Form filling failed")
|
||||
|
||||
# Store filled form
|
||||
object_key = f"tenants/{tenant_id}/filled/{filling_id}.pdf"
|
||||
|
||||
success = await storage_client.put_object( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
bucket_name=settings.output_bucket,
|
||||
object_name=object_key,
|
||||
data=BytesIO(filled_pdf),
|
||||
length=len(filled_pdf),
|
||||
content_type="application/pdf",
|
||||
metadata={
|
||||
"form_id": form_id,
|
||||
"filling_id": filling_id,
|
||||
"tenant_id": tenant_id,
|
||||
"calculation_id": calculation_id or "",
|
||||
"filled_at": datetime.utcnow().isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
if not success:
|
||||
# pylint: disable-next=broad-exception-raised
|
||||
raise Exception("Failed to store filled form")
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("forms_filled_total").labels(
|
||||
tenant_id=tenant_id, form_id=form_id
|
||||
).inc()
|
||||
|
||||
# Publish completion event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"filling_id": filling_id,
|
||||
"form_id": form_id,
|
||||
"tenant_id": tenant_id,
|
||||
"calculation_id": calculation_id,
|
||||
"s3_url": f"s3://{settings.output_bucket}/{object_key}",
|
||||
"field_count": len(field_values),
|
||||
},
|
||||
actor=actor,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.FORM_FILLED, event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
logger.info(
|
||||
"Form filling completed", filling_id=filling_id, form_id=form_id
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Form filling failed", filling_id=filling_id, error=str(e))
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("form_filling_errors_total").labels(
|
||||
tenant_id=tenant_id, form_id=form_id, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
async def _create_evidence_pack_async(
|
||||
taxpayer_id: str,
|
||||
tax_year: str,
|
||||
scope: str,
|
||||
evidence_items: list[dict[str, Any]],
|
||||
tenant_id: str,
|
||||
pack_id: str,
|
||||
actor: str,
|
||||
) -> None:
|
||||
"""Create evidence pack asynchronously"""
|
||||
|
||||
with tracer.start_as_current_span("create_evidence_pack_async") as span:
|
||||
span.set_attribute("pack_id", pack_id)
|
||||
span.set_attribute("taxpayer_id", taxpayer_id)
|
||||
span.set_attribute("scope", scope)
|
||||
|
||||
try:
|
||||
# Create evidence pack
|
||||
pack_result = await evidence_pack_generator.create_evidence_pack( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
taxpayer_id=taxpayer_id,
|
||||
tax_year=tax_year,
|
||||
scope=scope,
|
||||
evidence_items=evidence_items,
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("evidence_packs_created_total").labels(
|
||||
tenant_id=tenant_id, scope=scope
|
||||
).inc()
|
||||
|
||||
logger.info(
|
||||
"Evidence pack created",
|
||||
pack_id=pack_id,
|
||||
pack_size=pack_result["pack_size"],
|
||||
evidence_count=pack_result["evidence_count"],
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Evidence pack creation failed", pack_id=pack_id, error=str(e))
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
return JSONResponse(
|
||||
status_code=exc.status_code,
|
||||
content=ErrorResponse(
|
||||
type=f"https://httpstatuses.com/{exc.status_code}",
|
||||
title=exc.detail,
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id="",
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run("main:app", host="0.0.0.0", port=8009, reload=True, log_config=None)

apps/svc_forms/requirements.txt (Normal file, 37 lines)
@@ -0,0 +1,37 @@

# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0

# Service-specific dependencies
# PDF form filling
pdfrw>=0.4
reportlab>=4.0.0

# PDF processing
PyPDF2>=3.0.0
pypdf>=3.17.0

# Image processing for overlays
Pillow>=10.1.0

# ZIP file creation for evidence packs
zipfile36>=0.1.3

# Template processing
jinja2>=3.1.0

# QR code generation
qrcode>=7.4.0

# Barcode generation
python-barcode>=0.15.0

# Font handling
fonttools>=4.44.0

# Additional PDF utilities
pdfminer.six>=20231228

# Document conversion
python-docx>=1.1.0

apps/svc_hmrc/Dockerfile (Normal file, 54 lines)
@@ -0,0 +1,54 @@

# Multi-stage build for svc_hmrc
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create virtual environment
|
||||
RUN python -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY apps/svc_hmrc/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& groupadd -r appuser \
|
||||
&& useradd -r -g appuser appuser
|
||||
|
||||
# Copy virtual environment from builder
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_hmrc/ ./apps/svc_hmrc/
|
||||
|
||||
# Create non-root user and set permissions
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the application
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_hmrc.main:app", "--host", "0.0.0.0", "--port", "8000"]

apps/svc_hmrc/main.py (Normal file, 759 lines)
@@ -0,0 +1,759 @@

# FILE: apps/svc-hmrc/main.py
|
||||
|
||||
# HMRC submission service with MTD API integration and validation
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import (
|
||||
BaseAppSettings,
|
||||
create_event_bus,
|
||||
create_neo4j_client,
|
||||
create_vault_client,
|
||||
)
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.neo import Neo4jClient
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse, HMRCSubmissionRequest, HMRCSubmissionResponse
|
||||
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class HMRCSettings(BaseAppSettings):
|
||||
"""Settings for HMRC service"""
|
||||
|
||||
service_name: str = "svc-hmrc"
|
||||
|
||||
# HMRC API configuration
|
||||
hmrc_base_url: str = "https://api.service.hmrc.gov.uk"
|
||||
hmrc_sandbox_url: str = "https://test-api.service.hmrc.gov.uk"
|
||||
use_sandbox: bool = True
|
||||
|
||||
# OAuth configuration
|
||||
client_id: str = ""
|
||||
client_secret: str = ""
|
||||
redirect_uri: str = "http://localhost:8000/oauth/callback"
|
||||
|
||||
# API endpoints
|
||||
mtd_income_tax_endpoint: str = (
|
||||
"/income-tax/self-assessment/ni/{nino}/uk-property/{taxYear}"
|
||||
)
|
||||
mtd_self_employment_endpoint: str = (
|
||||
"/income-tax/self-assessment/ni/{nino}/self-employment/{businessId}"
|
||||
)
|
||||
|
||||
# Validation
|
||||
max_submission_retries: int = 3
|
||||
submission_timeout: int = 300 # 5 minutes
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-hmrc",
|
||||
title="Tax Agent HMRC Service",
|
||||
description="HMRC submission service with MTD API integration",
|
||||
settings_class=HMRCSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
vault_helper: VaultTransitHelper | None = None
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-hmrc")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global vault_helper, neo4j_client, event_bus
|
||||
|
||||
logger.info("Starting HMRC service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Vault helper
|
||||
vault_client = create_vault_client(settings)
|
||||
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
if not event_bus:
|
||||
raise Exception("Event bus not initialized")
|
||||
|
||||
await event_bus.start()
|
||||
|
||||
# Subscribe to form completion events
|
||||
await event_bus.subscribe(EventTopics.FORM_FILLED, _handle_form_filled) # type: ignore
|
||||
|
||||
logger.info("HMRC service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
|
||||
logger.info("Shutting down HMRC service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("HMRC service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"hmrc_environment": "sandbox" if settings.use_sandbox else "production",
|
||||
}
|
||||
|
||||
|
||||
@app.post("/submit", response_model=HMRCSubmissionResponse)
|
||||
async def submit_to_hmrc(
|
||||
request_data: HMRCSubmissionRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> HMRCSubmissionResponse:
|
||||
"""Submit tax return to HMRC"""
|
||||
|
||||
with tracer.start_as_current_span("submit_to_hmrc") as span:
|
||||
span.set_attribute("tax_year", request_data.tax_year)
|
||||
span.set_attribute("taxpayer_id", request_data.taxpayer_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("dry_run", request_data.dry_run)
|
||||
|
||||
try:
|
||||
# Generate submission ID
|
||||
submission_id = str(ulid.new())
|
||||
span.set_attribute("submission_id", submission_id)
|
||||
|
||||
# Start background submission
|
||||
background_tasks.add_task(
|
||||
_submit_to_hmrc_async,
|
||||
request_data.tax_year,
|
||||
request_data.taxpayer_id,
|
||||
request_data.dry_run,
|
||||
tenant_id,
|
||||
submission_id,
|
||||
current_user.get("sub", "system"),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"HMRC submission started",
|
||||
submission_id=submission_id,
|
||||
taxpayer_id=request_data.taxpayer_id,
|
||||
dry_run=request_data.dry_run,
|
||||
)
|
||||
|
||||
return HMRCSubmissionResponse(
|
||||
submission_id=submission_id,
|
||||
status="processing",
|
||||
hmrc_reference=None,
|
||||
submission_timestamp=datetime.utcnow(),
|
||||
validation_results={},
|
||||
dry_run=request_data.dry_run,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to start HMRC submission", error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to start HMRC submission"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/submissions/{submission_id}")
|
||||
async def get_submission_status(
|
||||
submission_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get submission status"""
|
||||
|
||||
with tracer.start_as_current_span("get_submission_status") as span:
|
||||
span.set_attribute("submission_id", submission_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get submission from Neo4j
|
||||
query = """
|
||||
MATCH (s:Submission {submission_id: $submission_id, tenant_id: $tenant_id})
|
||||
WHERE s.retracted_at IS NULL
|
||||
RETURN s
|
||||
"""
|
||||
|
||||
if not neo4j_client:
|
||||
raise Exception("Neo4j client not initialized")
|
||||
|
||||
results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
query, {"submission_id": submission_id, "tenant_id": tenant_id}
|
||||
)
|
||||
|
||||
if not results:
|
||||
raise HTTPException(status_code=404, detail="Submission not found")
|
||||
|
||||
submission = results[0]["s"]
|
||||
|
||||
return {
|
||||
"submission_id": submission_id,
|
||||
"status": submission.get("status"),
|
||||
"hmrc_reference": submission.get("hmrc_reference"),
|
||||
"submission_timestamp": submission.get("submission_timestamp"),
|
||||
"validation_results": json.loads(
|
||||
submission.get("validation_results", "{}")
|
||||
),
|
||||
"dry_run": submission.get("dry_run", False),
|
||||
"error_message": submission.get("error_message"),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to get submission status",
|
||||
submission_id=submission_id,
|
||||
error=str(e),
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to get submission status"
|
||||
)
|
||||
|
||||
|
||||
@app.post("/oauth/authorize")
|
||||
async def initiate_oauth_flow(
|
||||
taxpayer_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Initiate OAuth flow for HMRC authorization"""
|
||||
|
||||
with tracer.start_as_current_span("initiate_oauth") as span:
|
||||
span.set_attribute("taxpayer_id", taxpayer_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Generate state parameter for security
|
||||
state = str(ulid.new())
|
||||
|
||||
# Build authorization URL
|
||||
base_url = (
|
||||
settings.hmrc_sandbox_url
|
||||
if settings.use_sandbox
|
||||
else settings.hmrc_base_url
|
||||
)
|
||||
auth_url = f"{base_url}/oauth/authorize"
|
||||
|
||||
params = {
|
||||
"response_type": "code",
|
||||
"client_id": settings.client_id,
|
||||
"scope": "read:self-assessment write:self-assessment",
|
||||
"state": state,
|
||||
"redirect_uri": settings.redirect_uri,
|
||||
}
|
||||
|
||||
# Store state for validation
|
||||
await _store_oauth_state(state, taxpayer_id, tenant_id)
|
||||
|
||||
            # Build full URL (URL-encode so the space-separated scope survives)
            full_auth_url = f"{auth_url}?{urlencode(params)}"
|
||||
|
||||
return {
|
||||
"authorization_url": full_auth_url,
|
||||
"state": state,
|
||||
"expires_in": 600, # 10 minutes
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to initiate OAuth flow", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to initiate OAuth flow")
|
||||
|
||||
|
||||
@app.post("/oauth/callback")
|
||||
async def handle_oauth_callback(
|
||||
code: str,
|
||||
state: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Handle OAuth callback from HMRC"""
|
||||
|
||||
with tracer.start_as_current_span("handle_oauth_callback") as span:
|
||||
span.set_attribute("state", state)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
if not neo4j_client:
|
||||
raise HTTPException(status_code=500, detail="Neo4j client not initialized")
|
||||
|
||||
try:
|
||||
# Validate state
|
||||
oauth_data = await _get_oauth_state(state)
|
||||
if not oauth_data or oauth_data.get("tenant_id") != tenant_id:
|
||||
raise HTTPException(status_code=400, detail="Invalid state parameter")
|
||||
|
||||
# Exchange code for access token
|
||||
token_data = await _exchange_code_for_token(code)
|
||||
|
||||
# Store encrypted tokens
|
||||
if vault_helper is None:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Vault helper not initialized"
|
||||
)
|
||||
|
||||
encrypted_access_token = vault_helper.encrypt_field(
|
||||
"hmrc-access-token", token_data["access_token"]
|
||||
)
|
||||
encrypted_refresh_token = vault_helper.encrypt_field(
|
||||
"hmrc-refresh-token", token_data.get("refresh_token", "")
|
||||
)
|
||||
|
||||
# Store authorization in Neo4j
|
||||
auth_properties = {
|
||||
"taxpayer_id": oauth_data["taxpayer_id"],
|
||||
"tenant_id": tenant_id,
|
||||
"access_token": encrypted_access_token,
|
||||
"refresh_token": encrypted_refresh_token,
|
||||
"expires_at": datetime.utcnow().timestamp()
|
||||
+ token_data.get("expires_in", 3600),
|
||||
"scope": token_data.get("scope", ""),
|
||||
"authorized_at": datetime.utcnow().isoformat(),
|
||||
"source": "oauth_flow",
|
||||
"extractor_version": "1.0.0",
|
||||
"valid_from": datetime.utcnow(),
|
||||
"asserted_at": datetime.utcnow(),
|
||||
}
|
||||
|
||||
await neo4j_client.create_node("HMRCAuthorization", auth_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
# Clean up state
|
||||
await _delete_oauth_state(state)
|
||||
|
||||
return {
|
||||
"status": "authorized",
|
||||
"taxpayer_id": oauth_data["taxpayer_id"],
|
||||
"scope": token_data.get("scope", ""),
|
||||
"expires_in": token_data.get("expires_in", 3600),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("OAuth callback failed", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="OAuth callback failed")
|
||||
|
||||
|
||||
async def _handle_form_filled(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle form completion events for auto-submission"""
|
||||
try:
|
||||
if not neo4j_client:
|
||||
raise Exception("Neo4j client not initialized")
|
||||
|
||||
data = payload.data
|
||||
form_id = data.get("form_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
calculation_id = data.get("calculation_id")
|
||||
|
||||
if not form_id or not tenant_id:
|
||||
logger.warning("Invalid form filled event", data=data)
|
||||
return
|
||||
|
||||
# Only auto-submit if configured (this would be a tenant setting)
|
||||
auto_submit = False # Default to false for safety
|
||||
|
||||
if auto_submit and calculation_id:
|
||||
logger.info(
|
||||
"Auto-submitting form to HMRC",
|
||||
form_id=form_id,
|
||||
calculation_id=calculation_id,
|
||||
)
|
||||
|
||||
# Get taxpayer ID from calculation
|
||||
calc_query = """
|
||||
MATCH (c:Calculation {calculation_id: $calculation_id})
|
||||
WHERE c.retracted_at IS NULL
|
||||
RETURN c.taxpayer_id as taxpayer_id, c.tax_year as tax_year
|
||||
"""
|
||||
|
||||
calc_results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
calc_query, {"calculation_id": calculation_id}
|
||||
)
|
||||
|
||||
if calc_results:
|
||||
taxpayer_id = calc_results[0]["taxpayer_id"]
|
||||
tax_year = calc_results[0]["tax_year"]
|
||||
|
||||
await _submit_to_hmrc_async(
|
||||
tax_year=tax_year,
|
||||
taxpayer_id=taxpayer_id,
|
||||
dry_run=True, # Always dry run for auto-submission
|
||||
tenant_id=tenant_id,
|
||||
submission_id=str(ulid.new()),
|
||||
actor=payload.actor,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to handle form filled event", error=str(e))
|
||||
|
||||
|
||||
async def _submit_to_hmrc_async(
|
||||
tax_year: str,
|
||||
taxpayer_id: str,
|
||||
dry_run: bool,
|
||||
tenant_id: str,
|
||||
submission_id: str,
|
||||
actor: str,
|
||||
) -> None:
|
||||
"""Submit to HMRC asynchronously"""
|
||||
|
||||
with tracer.start_as_current_span("submit_to_hmrc_async") as span:
|
||||
span.set_attribute("submission_id", submission_id)
|
||||
span.set_attribute("taxpayer_id", taxpayer_id)
|
||||
span.set_attribute("dry_run", dry_run)
|
||||
|
||||
if not event_bus:
|
||||
raise Exception("Event bus not initialized")
|
||||
|
||||
try:
|
||||
# Get taxpayer data
|
||||
taxpayer_data = await _get_taxpayer_data(taxpayer_id, tenant_id)
|
||||
|
||||
# Get calculation data
|
||||
calculation_data = await _get_latest_calculation(
|
||||
taxpayer_id, tax_year, tenant_id
|
||||
)
|
||||
|
||||
# Validate data
|
||||
validation_results = await _validate_submission_data(
|
||||
taxpayer_data, calculation_data
|
||||
)
|
||||
|
||||
# Prepare submission
|
||||
submission_data = await _prepare_submission_data(
|
||||
taxpayer_data, calculation_data, tax_year
|
||||
)
|
||||
|
||||
# Submit to HMRC (or simulate if dry run)
|
||||
if dry_run:
|
||||
hmrc_response = await _simulate_hmrc_submission(submission_data)
|
||||
else:
|
||||
hmrc_response = await _submit_to_hmrc_api(
|
||||
submission_data, taxpayer_id, tenant_id
|
||||
)
|
||||
|
||||
# Store submission record
|
||||
await _store_submission_record(
|
||||
submission_id,
|
||||
taxpayer_id,
|
||||
tax_year,
|
||||
tenant_id,
|
||||
hmrc_response,
|
||||
validation_results,
|
||||
dry_run,
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("hmrc_submissions_total").labels(
|
||||
tenant_id=tenant_id,
|
||||
dry_run=str(dry_run),
|
||||
status=hmrc_response.get("status", "unknown"),
|
||||
).inc()
|
||||
|
||||
# Publish completion event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"submission_id": submission_id,
|
||||
"taxpayer_id": taxpayer_id,
|
||||
"tax_year": tax_year,
|
||||
"tenant_id": tenant_id,
|
||||
"status": hmrc_response.get("status"),
|
||||
"hmrc_reference": hmrc_response.get("reference"),
|
||||
"dry_run": dry_run,
|
||||
},
|
||||
actor=actor,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.HMRC_SUBMITTED, event_payload) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
logger.info(
|
||||
"HMRC submission completed",
|
||||
submission_id=submission_id,
|
||||
status=hmrc_response.get("status"),
|
||||
dry_run=dry_run,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"HMRC submission failed", submission_id=submission_id, error=str(e)
|
||||
)
|
||||
|
||||
# Store error record
|
||||
await _store_submission_error(submission_id, str(e), tenant_id)
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("hmrc_submission_errors_total").labels(
|
||||
tenant_id=tenant_id, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
async def _get_taxpayer_data(taxpayer_id: str, tenant_id: str) -> dict[str, Any]:
|
||||
"""Get taxpayer data from knowledge graph"""
|
||||
|
||||
query = """
|
||||
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})
|
||||
WHERE t.retracted_at IS NULL
|
||||
RETURN t
|
||||
"""
|
||||
if not neo4j_client:
|
||||
raise Exception("Neo4j client not initialized")
|
||||
|
||||
results = await neo4j_client.run_query(
|
||||
query, {"taxpayer_id": taxpayer_id, "tenant_id": tenant_id}
|
||||
)
|
||||
|
||||
if not results:
|
||||
raise Exception(f"Taxpayer not found: {taxpayer_id}")
|
||||
|
||||
return results[0]["t"]
|
||||
|
||||
|
||||
async def _get_latest_calculation(
|
||||
taxpayer_id: str, tax_year: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Get latest calculation for taxpayer and tax year"""
|
||||
|
||||
query = """
|
||||
MATCH (c:Calculation {taxpayer_id: $taxpayer_id, tax_year: $tax_year, tenant_id: $tenant_id})
|
||||
WHERE c.retracted_at IS NULL
|
||||
RETURN c
|
||||
ORDER BY c.calculated_at DESC
|
||||
LIMIT 1
|
||||
"""
|
||||
|
||||
if not neo4j_client:
|
||||
raise Exception("Neo4j client not initialized")
|
||||
|
||||
results = await neo4j_client.run_query( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
query,
|
||||
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
|
||||
)
|
||||
|
||||
if not results:
|
||||
raise Exception(
|
||||
f"No calculation found for taxpayer {taxpayer_id} and tax year {tax_year}"
|
||||
)
|
||||
|
||||
return results[0]["c"]
|
||||
|
||||
|
||||
async def _validate_submission_data(
|
||||
taxpayer_data: dict[str, Any], calculation_data: dict[str, Any]
|
||||
) -> dict[str, Any]:
|
||||
"""Validate submission data"""
|
||||
|
||||
validation_results: dict[str, bool | list[str]] = {
|
||||
"valid": True,
|
||||
"errors": [],
|
||||
"warnings": [],
|
||||
}
|
||||
|
||||
# Check required taxpayer fields
|
||||
if not taxpayer_data.get("utr"):
|
||||
validation_results["errors"].append("UTR is required")
|
||||
validation_results["valid"] = False
|
||||
|
||||
if not taxpayer_data.get("ni_number"):
|
||||
validation_results["errors"].append("National Insurance number is required")
|
||||
validation_results["valid"] = False
|
||||
|
||||
# Check calculation data
|
||||
if not calculation_data.get("schedule"):
|
||||
validation_results["errors"].append("Schedule is required")
|
||||
validation_results["valid"] = False
|
||||
|
||||
return validation_results
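

# --- Illustrative sketch (not wired into the service) -------------------------------
# The checks above only assert that fields are present. Basic format checks could be
# layered on top, e.g. a 10-digit UTR and the usual NINO shape; the regexes below are
# simplified approximations, not HMRC's full validation rules.
import re

_UTR_RE = re.compile(r"^\d{10}$")
_NINO_RE = re.compile(r"^[A-CEGHJ-PR-TW-Z]{2}\d{6}[A-D]$")


def _looks_like_utr(value: str) -> bool:
    """Rough shape check for a Unique Taxpayer Reference (10 digits)."""
    return bool(_UTR_RE.match(value.strip()))


def _looks_like_nino(value: str) -> bool:
    """Rough shape check for a National Insurance number (simplified prefix rules)."""
    return bool(_NINO_RE.match(value.replace(" ", "").upper()))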
|
||||
|
||||
|
||||
async def _prepare_submission_data(
|
||||
taxpayer_data: dict[str, Any], calculation_data: dict[str, Any], tax_year: str
|
||||
) -> dict[str, Any]:
|
||||
"""Prepare data for HMRC submission"""
|
||||
|
||||
# This would format data according to HMRC MTD API requirements
|
||||
submission_data = {
|
||||
"taxYear": tax_year,
|
||||
"nino": taxpayer_data.get("ni_number"),
|
||||
"utr": taxpayer_data.get("utr"),
|
||||
"schedule": calculation_data.get("schedule"),
|
||||
"submissionTimestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
return submission_data
|
||||
|
||||
|
||||
async def _simulate_hmrc_submission(submission_data: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Simulate HMRC submission for dry run"""
|
||||
|
||||
# Simulate processing delay
|
||||
await asyncio.sleep(1)
|
||||
|
||||
return {
|
||||
"status": "accepted",
|
||||
"reference": f"DRY_RUN_{ulid.new()}",
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"dry_run": True,
|
||||
}
|
||||
|
||||
|
||||
async def _submit_to_hmrc_api(
|
||||
submission_data: dict[str, Any], taxpayer_id: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Submit to actual HMRC API"""
|
||||
|
||||
# This would implement the actual HMRC MTD API calls
|
||||
# For now, return mock response
|
||||
logger.warning("Actual HMRC API submission not implemented")
|
||||
|
||||
return {
|
||||
"status": "not_implemented",
|
||||
"reference": None,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"error": "HMRC API integration not implemented",
|
||||
}
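

# --- Illustrative sketch (not wired into the service) -------------------------------
# The helper below shows roughly what a live MTD call could look like, using the
# requests library that requests-oauthlib already pulls in. The Accept header is
# HMRC's versioned media type; the access-token argument, the choice of the
# uk-property endpoint, and the response field name are assumptions, not a confirmed
# contract.
async def _submit_to_hmrc_api_sketch(
    submission_data: dict[str, Any], access_token: str
) -> dict[str, Any]:
    """Hypothetical live submission; kept separate from the mock above."""
    import requests  # installed transitively via requests-oauthlib

    base_url = settings.hmrc_sandbox_url if settings.use_sandbox else settings.hmrc_base_url
    url = base_url + settings.mtd_income_tax_endpoint.format(
        nino=submission_data["nino"], taxYear=submission_data["taxYear"]
    )
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Accept": "application/vnd.hmrc.1.0+json",
        "Content-Type": "application/json",
    }
    # requests is synchronous, so keep it off the event loop
    response = await asyncio.to_thread(
        requests.post, url, json=submission_data, headers=headers, timeout=30
    )
    response.raise_for_status()
    return {
        "status": "accepted",
        "reference": response.json().get("transactionReference"),
        "timestamp": datetime.utcnow().isoformat(),
    }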
|
||||
|
||||
|
||||
async def _store_submission_record(
|
||||
submission_id: str,
|
||||
taxpayer_id: str,
|
||||
tax_year: str,
|
||||
tenant_id: str,
|
||||
hmrc_response: dict[str, Any],
|
||||
validation_results: dict[str, Any],
|
||||
dry_run: bool,
|
||||
) -> None:
|
||||
"""Store submission record in knowledge graph"""
|
||||
|
||||
submission_properties = {
|
||||
"submission_id": submission_id,
|
||||
"taxpayer_id": taxpayer_id,
|
||||
"tax_year": tax_year,
|
||||
"tenant_id": tenant_id,
|
||||
"status": hmrc_response.get("status"),
|
||||
"hmrc_reference": hmrc_response.get("reference"),
|
||||
"submission_timestamp": hmrc_response.get("timestamp"),
|
||||
"validation_results": json.dumps(validation_results),
|
||||
"dry_run": dry_run,
|
||||
"source": "hmrc_service",
|
||||
"extractor_version": "1.0.0",
|
||||
"valid_from": datetime.utcnow(),
|
||||
"asserted_at": datetime.utcnow(),
|
||||
}
|
||||
if not neo4j_client:
|
||||
raise Exception("Neo4j client not initialized")
|
||||
|
||||
await neo4j_client.create_node("Submission", submission_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
|
||||
async def _store_submission_error(
|
||||
submission_id: str, error_message: str, tenant_id: str
|
||||
) -> None:
|
||||
"""Store submission error"""
|
||||
|
||||
error_properties = {
|
||||
"submission_id": submission_id,
|
||||
"tenant_id": tenant_id,
|
||||
"status": "error",
|
||||
"error_message": error_message,
|
||||
"submission_timestamp": datetime.utcnow().isoformat(),
|
||||
"source": "hmrc_service",
|
||||
"extractor_version": "1.0.0",
|
||||
"valid_from": datetime.utcnow(),
|
||||
"asserted_at": datetime.utcnow(),
|
||||
}
|
||||
if not neo4j_client:
|
||||
raise Exception("Neo4j client not initialized")
|
||||
|
||||
await neo4j_client.create_node("Submission", error_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
|
||||
async def _store_oauth_state(state: str, taxpayer_id: str, tenant_id: str) -> None:
|
||||
"""Store OAuth state temporarily"""
|
||||
# This would use Redis or similar for temporary storage
|
||||
# For now, just log
|
||||
logger.debug("OAuth state stored", state=state, taxpayer_id=taxpayer_id)
|
||||
|
||||
|
||||
async def _get_oauth_state(state: str) -> dict[str, Any] | None:
|
||||
"""Get OAuth state"""
|
||||
# This would retrieve from Redis
|
||||
# For now, return mock data
|
||||
return {"taxpayer_id": "test_taxpayer", "tenant_id": "test_tenant"}
|
||||
|
||||
|
||||
async def _delete_oauth_state(state: str) -> None:
|
||||
"""Delete OAuth state"""
|
||||
# This would delete from Redis
|
||||
logger.debug("OAuth state deleted", state=state)
|
||||
|
||||
|
||||
async def _exchange_code_for_token(code: str) -> dict[str, Any]:
|
||||
"""Exchange authorization code for access token"""
|
||||
# This would call HMRC token endpoint
|
||||
# For now, return mock token
|
||||
return {
|
||||
"access_token": "mock_access_token",
|
||||
"refresh_token": "mock_refresh_token",
|
||||
"expires_in": 3600,
|
||||
"scope": "read:self-assessment write:self-assessment",
|
||||
}
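

# --- Illustrative sketch (not wired into the service) -------------------------------
# A live exchange would post the authorization code to HMRC's token endpoint with the
# client credentials held in settings. The sketch uses requests-oauthlib (already in
# requirements.txt); the /oauth/token path mirrors HMRC's documented layout but is an
# assumption here.
async def _exchange_code_for_token_sketch(code: str) -> dict[str, Any]:
    """Hypothetical real token exchange; the mock above stays in place for tests."""
    from requests_oauthlib import OAuth2Session

    base_url = settings.hmrc_sandbox_url if settings.use_sandbox else settings.hmrc_base_url
    session = OAuth2Session(settings.client_id, redirect_uri=settings.redirect_uri)
    # fetch_token is synchronous, so run it in a worker thread
    token = await asyncio.to_thread(
        session.fetch_token,
        f"{base_url}/oauth/token",
        code=code,
        client_secret=settings.client_secret,
        include_client_id=True,
    )
    return dict(token)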
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
return JSONResponse(
|
||||
status_code=exc.status_code,
|
||||
content=ErrorResponse(
|
||||
type=f"https://httpstatuses.com/{exc.status_code}",
|
||||
title=exc.detail,
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id=getattr(request.state, "trace_id", None),
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run("main:app", host="0.0.0.0", port=8010, reload=True, log_config=None)
|
||||
apps/svc_hmrc/requirements.txt (new file, 40 lines)
@@ -0,0 +1,40 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0

# Service-specific dependencies
# OAuth and authentication
authlib>=1.2.0
oauthlib>=3.2.0

# HTTP client with OAuth support
requests-oauthlib>=1.3.0

# XML processing for HMRC APIs
lxml>=4.9.0
xmltodict>=0.13.0

# JSON Web Tokens
pyjwt>=2.8.0

# UK government API utilities
govuk-frontend-jinja>=2.8.0

# Date and time for tax years
python-dateutil>=2.8.0

# Retry mechanisms
tenacity>=8.2.0

# Rate limiting
ratelimit>=2.2.0

# API validation
marshmallow>=3.20.0

# Encryption for sensitive data
cryptography>=41.0.0

# Additional HTTP utilities
urllib3>=2.1.0
apps/svc_ingestion/Dockerfile (new file, 54 lines)
@@ -0,0 +1,54 @@
# Multi-stage build for svc_ingestion
FROM python:3.12-slim AS builder

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy requirements and install dependencies
# Use base requirements (no ML dependencies for ingestion service)
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_ingestion/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Production stage
FROM python:3.12-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r appuser \
    && useradd -r -g appuser appuser

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_ingestion/ ./apps/svc_ingestion/

# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_ingestion.main:app", "--host", "0.0.0.0", "--port", "8000"]
apps/svc_ingestion/docker.env (new file, 10 lines)
@@ -0,0 +1,10 @@
# FILE: apps/svc_ingestion/docker.env
VAULT_ADDR=http://vault:8200
VAULT_TOKEN=${VAULT_DEV_ROOT_TOKEN_ID:-root}
MINIO_ENDPOINT=minio:9092
POSTGRES_URL=postgresql://postgres:${POSTGRES_PASSWORD:-postgres}@postgres:5432/tax_system
REDIS_URL=redis://redis:6379
EVENT_BUS_TYPE=${EVENT_BUS_TYPE:-memory}
NATS_SERVERS=${NATS_SERVERS:-nats://nats:4222}
NATS_STREAM_NAME=${NATS_STREAM_NAME:-TAX_AGENT_EVENTS}
NATS_CONSUMER_GROUP=${NATS_CONSUMER_GROUP:-tax-agent}
apps/svc_ingestion/main.py (new file, 351 lines)
@@ -0,0 +1,351 @@
|
||||
"""Document upload, storage, checksum validation, metadata extraction service."""
|
||||
|
||||
import hashlib
|
||||
import mimetypes
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import UTC, datetime
|
||||
from typing import Any, cast
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import Depends, File, HTTPException, Request, UploadFile
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app, get_tenant_dependency, get_user_dependency
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.observability import get_metrics, get_tracer
|
||||
from libs.schemas import DocumentKind, DocumentUploadResponse
|
||||
from libs.storage import DocumentStorage, StorageClient
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class IngestionSettings(BaseAppSettings):
|
||||
"""Settings for ingestion service"""
|
||||
|
||||
service_name: str = "svc-ingestion"
|
||||
|
||||
# File upload limits
|
||||
max_file_size: int = 50 * 1024 * 1024 # 50MB
|
||||
allowed_mime_types: list[str] = [
|
||||
"application/pdf",
|
||||
"image/jpeg",
|
||||
"image/png",
|
||||
"image/tiff",
|
||||
"text/csv",
|
||||
"application/vnd.ms-excel",
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
||||
]
|
||||
|
||||
# Storage configuration
|
||||
raw_documents_bucket: str = "raw-documents"
|
||||
evidence_bucket: str = "evidence"
|
||||
|
||||
|
||||
# Global clients (will be initialized in startup)
|
||||
storage_client: StorageClient | None = None
|
||||
document_storage: DocumentStorage | None = None
|
||||
event_bus: EventBus | None = None
|
||||
|
||||
# Settings will be initialized after app creation
|
||||
settings: IngestionSettings
|
||||
|
||||
|
||||
def init_dependencies(app_settings: IngestionSettings) -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global storage_client, document_storage, event_bus, settings
|
||||
|
||||
settings = app_settings
|
||||
logger.info(
|
||||
"Starting ingestion service",
|
||||
minio_endpoint=settings.minio_endpoint,
|
||||
minio_access_key=settings.minio_access_key,
|
||||
)
|
||||
|
||||
# Initialize clients
|
||||
minio_client = create_minio_client(settings)
|
||||
storage_client = StorageClient(minio_client)
|
||||
document_storage = DocumentStorage(storage_client)
|
||||
event_bus = create_event_bus(settings)
|
||||
|
||||
logger.info("Ingestion service started successfully")
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, _settings = create_app(
|
||||
service_name="svc-ingestion",
|
||||
title="Tax Agent Ingestion Service",
|
||||
description="Document upload and storage service",
|
||||
settings_class=IngestionSettings,
|
||||
)
|
||||
|
||||
# Initialize dependencies immediately
|
||||
init_dependencies(cast(IngestionSettings, _settings))
|
||||
|
||||
# Get observability components
|
||||
tracer = get_tracer("svc-ingestion")
|
||||
metrics = get_metrics("svc-ingestion")
|
||||
|
||||
|
||||
# Health endpoints are provided by app_factory
|
||||
|
||||
|
||||
@app.post("/upload", response_model=DocumentUploadResponse)
|
||||
async def upload_document(
|
||||
request: Request,
|
||||
file: UploadFile = File(...),
|
||||
kind: DocumentKind = DocumentKind.INVOICE,
|
||||
source: str = "manual_upload",
|
||||
current_user: dict[str, Any] = Depends(get_user_dependency()),
|
||||
tenant_id: str = Depends(get_tenant_dependency()),
|
||||
) -> DocumentUploadResponse:
|
||||
"""Upload document for processing"""
|
||||
|
||||
# Check if services are initialized
|
||||
if document_storage is None or event_bus is None:
|
||||
raise HTTPException(
|
||||
status_code=503, detail="Service not ready - dependencies not initialized"
|
||||
)
|
||||
|
||||
with tracer.start_as_current_span("upload_document") as span:
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("document_kind", kind.value)
|
||||
span.set_attribute("source", source)
|
||||
|
||||
try:
|
||||
# Validate file
|
||||
await _validate_upload(file)
|
||||
|
||||
# Generate document ID
|
||||
doc_id = f"doc_{ulid.new()}"
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
|
||||
# Read file content
|
||||
content = await file.read()
|
||||
|
||||
# Calculate checksum
|
||||
checksum = hashlib.sha256(content).hexdigest()
|
||||
|
||||
# Detect MIME type
|
||||
detected_mime = None
|
||||
if file.filename:
|
||||
detected_mime = mimetypes.guess_type(file.filename)[0]
|
||||
content_type = (
|
||||
detected_mime or file.content_type or "application/octet-stream"
|
||||
)
|
||||
|
||||
# Store document
|
||||
storage_result = await document_storage.store_document(
|
||||
tenant_id=tenant_id,
|
||||
doc_id=doc_id,
|
||||
content=content,
|
||||
content_type=content_type,
|
||||
metadata={
|
||||
"original_filename": file.filename or "unknown",
|
||||
"kind": kind.value,
|
||||
"source": source,
|
||||
"uploaded_by": current_user.get("sub", "unknown"),
|
||||
"uploaded_at": datetime.now(UTC).isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
# Publish event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"doc_id": doc_id,
|
||||
"tenant_id": tenant_id,
|
||||
"kind": kind.value,
|
||||
"source": source,
|
||||
"checksum": checksum,
|
||||
"file_size": len(content),
|
||||
"content_type": content_type,
|
||||
"s3_url": storage_result["s3_url"],
|
||||
},
|
||||
actor=current_user.get("sub", "system"),
|
||||
tenant_id=tenant_id,
|
||||
trace_id=str(span.get_span_context().trace_id),
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.DOC_INGESTED, event_payload)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter(
|
||||
"documents_uploaded_total", labelnames=["tenant_id", "kind", "source"]
|
||||
).labels(tenant_id=tenant_id, kind=kind.value, source=source).inc()
|
||||
|
||||
metrics.histogram(
|
||||
"document_size_bytes", labelnames=["tenant_id", "kind"]
|
||||
).labels(tenant_id=tenant_id, kind=kind.value).observe(len(content))
|
||||
|
||||
logger.info(
|
||||
"Document uploaded successfully",
|
||||
doc_id=doc_id,
|
||||
tenant_id=tenant_id,
|
||||
kind=kind.value,
|
||||
size=len(content),
|
||||
checksum=checksum,
|
||||
)
|
||||
|
||||
return DocumentUploadResponse(
|
||||
doc_id=doc_id, s3_url=storage_result["s3_url"], checksum=checksum
|
||||
)
|
||||
|
||||
except ValueError as e:
|
||||
logger.warning("Upload validation failed", error=str(e))
|
||||
# Track validation errors
|
||||
try:
|
||||
metrics.counter(
|
||||
"upload_errors_total", labelnames=["tenant_id", "error_type"]
|
||||
).labels(tenant_id=tenant_id, error_type="ValueError").inc()
|
||||
except Exception:
|
||||
pass # Don't fail on metrics errors
|
||||
raise HTTPException(status_code=400, detail=str(e))
|
||||
except Exception as e:
|
||||
logger.error("Upload failed", error=str(e))
|
||||
# Track upload errors
|
||||
try:
|
||||
metrics.counter(
|
||||
"upload_errors_total", labelnames=["tenant_id", "error_type"]
|
||||
).labels(tenant_id=tenant_id, error_type=type(e).__name__).inc()
|
||||
except Exception:
|
||||
pass # Don't fail on metrics errors
|
||||
raise HTTPException(status_code=500, detail="Upload failed")
|
||||
|
||||
|
||||
@app.get("/documents/{doc_id}")
|
||||
async def get_document_info(
|
||||
doc_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_user_dependency()),
|
||||
tenant_id: str = Depends(get_tenant_dependency()),
|
||||
) -> dict[str, str]:
|
||||
"""Get document information"""
|
||||
|
||||
# Check if services are initialized
|
||||
if storage_client is None:
|
||||
raise HTTPException(
|
||||
status_code=503, detail="Service not ready - dependencies not initialized"
|
||||
)
|
||||
|
||||
with tracer.start_as_current_span("get_document_info") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Check if document exists
|
||||
ingestion_settings = cast(IngestionSettings, settings)
|
||||
bucket_name = ingestion_settings.raw_documents_bucket
|
||||
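            # NOTE: this key assumes a .pdf extension; uploads stored with a different
            # extension by DocumentStorage may not be found by this lookup.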
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
|
||||
|
||||
exists = await storage_client.object_exists(bucket_name, object_key)
|
||||
|
||||
if not exists:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Get presigned URL for download
|
||||
download_url = await storage_client.get_presigned_url(
|
||||
bucket_name=bucket_name, object_name=object_key, method="GET"
|
||||
)
|
||||
|
||||
if not download_url:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to generate download URL"
|
||||
)
|
||||
|
||||
return {
|
||||
"doc_id": doc_id,
|
||||
"download_url": download_url,
|
||||
"s3_url": f"s3://{bucket_name}/{object_key}",
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to get document info", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to get document info")
|
||||
|
||||
|
||||
@app.delete("/documents/{doc_id}")
|
||||
async def delete_document(
|
||||
doc_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_user_dependency()),
|
||||
tenant_id: str = Depends(get_tenant_dependency()),
|
||||
) -> dict[str, str]:
|
||||
"""Delete document"""
|
||||
|
||||
# Check if services are initialized
|
||||
if storage_client is None:
|
||||
raise HTTPException(
|
||||
status_code=503, detail="Service not ready - dependencies not initialized"
|
||||
)
|
||||
|
||||
with tracer.start_as_current_span("delete_document") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Delete from storage
|
||||
ingestion_settings = cast(IngestionSettings, settings)
|
||||
bucket_name = ingestion_settings.raw_documents_bucket
|
||||
object_key = f"tenants/{tenant_id}/raw/{doc_id}.pdf"
|
||||
|
||||
success = await storage_client.delete_object(bucket_name, object_key)
|
||||
|
||||
if not success:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
logger.info("Document deleted", doc_id=doc_id, tenant_id=tenant_id)
|
||||
|
||||
return {"message": "Document deleted successfully"}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to delete document", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to delete document")
|
||||
|
||||
|
||||
async def _validate_upload(file: UploadFile) -> None:
|
||||
"""Validate uploaded file"""
|
||||
|
||||
# Cast settings to the correct type
|
||||
ingestion_settings = cast(IngestionSettings, settings)
|
||||
|
||||
# Check file size
|
||||
if file.size and file.size > ingestion_settings.max_file_size:
|
||||
raise ValueError(
|
||||
f"File too large: {file.size} bytes (max: {ingestion_settings.max_file_size})"
|
||||
)
|
||||
|
||||
# Check MIME type
|
||||
if file.content_type not in ingestion_settings.allowed_mime_types:
|
||||
# Try to detect MIME type from filename
|
||||
detected_mime = None
|
||||
if file.filename:
|
||||
detected_mime = mimetypes.guess_type(file.filename)[0]
|
||||
if detected_mime not in ingestion_settings.allowed_mime_types:
|
||||
raise ValueError(f"Unsupported file type: {file.content_type}")
|
||||
|
||||
# Check filename
|
||||
if not file.filename:
|
||||
raise ValueError("Filename is required")
|
||||
|
||||
# Check for malicious filenames
|
||||
if ".." in file.filename or "/" in file.filename or "\\" in file.filename:
|
||||
raise ValueError("Invalid filename")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run(
|
||||
"main:app",
|
||||
host="0.0.0.0",
|
||||
port=8000,
|
||||
reload=True,
|
||||
log_config=None, # Use structlog configuration
|
||||
)
|
||||
apps/svc_ingestion/requirements.txt (new file, 9 lines)
@@ -0,0 +1,9 @@
# Service-specific dependencies for svc_ingestion
# File upload and processing
aiofiles>=23.2.0

# MIME type detection
python-magic>=0.4.27

# Image processing (for thumbnails) - lightweight
Pillow>=10.1.0
apps/svc_kg/Dockerfile (new file, 54 lines)
@@ -0,0 +1,54 @@
# Multi-stage build for svc_kg
FROM python:3.12-slim AS builder

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY libs/requirements-rdf.txt /tmp/libs-rdf.txt
COPY apps/svc_kg/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/libs-rdf.txt -r /tmp/requirements.txt

# Production stage
FROM python:3.12-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r appuser \
    && useradd -r -g appuser appuser

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_kg/ ./apps/svc_kg/

# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_kg.main:app", "--host", "0.0.0.0", "--port", "8000"]
apps/svc_kg/main.py (new file, 572 lines)
@@ -0,0 +1,572 @@
|
||||
# FILE: apps/svc_kg/main.py
|
||||
|
||||
# Knowledge graph facade with CRUD, queries, lineage, and SHACL validation
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
from fastapi import Depends, HTTPException, Query, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
|
||||
from libs.events import EventBus
|
||||
from libs.neo import Neo4jClient, SHACLValidator, TemporalQueries
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class KGSettings(BaseAppSettings):
|
||||
"""Settings for KG service"""
|
||||
|
||||
service_name: str = "svc-kg"
|
||||
|
||||
# SHACL validation
|
||||
shapes_file: str = "schemas/shapes.ttl"
|
||||
validate_on_write: bool = True
|
||||
|
||||
# Query limits
|
||||
max_results: int = 1000
|
||||
max_depth: int = 10
|
||||
query_timeout: int = 30
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-kg",
|
||||
title="Tax Agent Knowledge Graph Service",
|
||||
description="Knowledge graph facade with CRUD and queries",
|
||||
settings_class=KGSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
shacl_validator: SHACLValidator | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-kg")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global neo4j_client, shacl_validator, event_bus
|
||||
|
||||
logger.info("Starting KG service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize SHACL validator
|
||||
if os.path.exists(settings.shapes_file):
|
||||
shacl_validator = SHACLValidator(settings.shapes_file)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
await event_bus.start()
|
||||
|
||||
logger.info("KG service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
|
||||
logger.info("Shutting down KG service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("KG service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
|
||||
@app.post("/nodes/{label}")
|
||||
async def create_node(
|
||||
label: str,
|
||||
properties: dict[str, Any],
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Create a new node"""
|
||||
|
||||
with tracer.start_as_current_span("create_node") as span:
|
||||
span.set_attribute("label", label)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Add tenant isolation
|
||||
properties["tenant_id"] = tenant_id
|
||||
properties["created_by"] = current_user.get("sub", "system")
|
||||
|
||||
# Validate with SHACL if enabled
|
||||
if settings.validate_on_write and shacl_validator:
|
||||
await _validate_node(label, properties)
|
||||
|
||||
# Create node
|
||||
result = await neo4j_client.create_node(label, properties)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("nodes_created_total").labels(
|
||||
tenant_id=tenant_id, label=label
|
||||
).inc()
|
||||
|
||||
logger.info("Node created", label=label, node_id=result.get("id"))
|
||||
|
||||
return {
|
||||
"status": "created",
|
||||
"label": label,
|
||||
"properties": properties,
|
||||
"neo4j_result": result,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to create node", label=label, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to create node: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/nodes/{label}")
|
||||
async def get_nodes(
|
||||
label: str,
|
||||
limit: int = Query(default=100, le=settings.max_results),
|
||||
filters: str | None = Query(default=None),
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get nodes by label with optional filters"""
|
||||
|
||||
with tracer.start_as_current_span("get_nodes") as span:
|
||||
span.set_attribute("label", label)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("limit", limit)
|
||||
|
||||
try:
|
||||
# Parse filters
|
||||
filter_dict: dict[str, Any] = {}
|
||||
if filters:
|
||||
try:
|
||||
filter_dict = json.loads(filters)
|
||||
except json.JSONDecodeError:
|
||||
raise HTTPException(status_code=400, detail="Invalid filters JSON")
|
||||
|
||||
# Add tenant isolation
|
||||
filter_dict["tenant_id"] = tenant_id
|
||||
|
||||
# Build query
|
||||
query = TemporalQueries.get_current_state_query(label, filter_dict)
|
||||
query += f" LIMIT {limit}"
|
||||
|
||||
# Execute query
|
||||
results = await neo4j_client.run_query(query)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("nodes_queried_total").labels(
|
||||
tenant_id=tenant_id, label=label
|
||||
).inc()
|
||||
|
||||
return {
|
||||
"label": label,
|
||||
"count": len(results),
|
||||
"nodes": [result["n"] for result in results],
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to get nodes", label=label, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to get nodes: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.get("/nodes/{label}/{node_id}")
|
||||
async def get_node(
|
||||
label: str,
|
||||
node_id: str,
|
||||
include_lineage: bool = Query(default=False),
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get specific node with optional lineage"""
|
||||
|
||||
with tracer.start_as_current_span("get_node") as span:
|
||||
span.set_attribute("label", label)
|
||||
span.set_attribute("node_id", node_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get node
|
||||
query = f"""
|
||||
MATCH (n:{label} {{id: $node_id, tenant_id: $tenant_id}})
|
||||
WHERE n.retracted_at IS NULL
|
||||
RETURN n
|
||||
"""
|
||||
|
||||
results = await neo4j_client.run_query(
|
||||
query, {"node_id": node_id, "tenant_id": tenant_id}
|
||||
)
|
||||
|
||||
if not results:
|
||||
raise HTTPException(status_code=404, detail="Node not found")
|
||||
|
||||
node_data = results[0]["n"]
|
||||
|
||||
# Get lineage if requested
|
||||
lineage: list[dict[str, Any]] = []
|
||||
if include_lineage:
|
||||
lineage = await neo4j_client.get_node_lineage(node_id)
|
||||
|
||||
return {"node": node_data, "lineage": lineage if include_lineage else None}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to get node", label=label, node_id=node_id, error=str(e)
|
||||
)
|
||||
raise HTTPException(status_code=500, detail=f"Failed to get node: {str(e)}")
|
||||
|
||||
|
||||
@app.put("/nodes/{label}/{node_id}")
|
||||
async def update_node(
|
||||
label: str,
|
||||
node_id: str,
|
||||
properties: dict[str, Any],
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Update node with bitemporal versioning"""
|
||||
|
||||
with tracer.start_as_current_span("update_node") as span:
|
||||
span.set_attribute("label", label)
|
||||
span.set_attribute("node_id", node_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Add metadata
|
||||
properties["tenant_id"] = tenant_id
|
||||
properties["updated_by"] = current_user.get("sub", "system")
|
||||
|
||||
# Validate with SHACL if enabled
|
||||
if settings.validate_on_write and shacl_validator:
|
||||
await _validate_node(label, properties)
|
||||
|
||||
# Update node (creates new version)
|
||||
await neo4j_client.update_node(label, node_id, properties)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("nodes_updated_total").labels(
|
||||
tenant_id=tenant_id, label=label
|
||||
).inc()
|
||||
|
||||
logger.info("Node updated", label=label, node_id=node_id)
|
||||
|
||||
return {
|
||||
"status": "updated",
|
||||
"label": label,
|
||||
"node_id": node_id,
|
||||
"properties": properties,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to update node", label=label, node_id=node_id, error=str(e)
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to update node: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.post("/relationships")
|
||||
async def create_relationship(
|
||||
from_label: str,
|
||||
from_id: str,
|
||||
to_label: str,
|
||||
to_id: str,
|
||||
relationship_type: str,
|
||||
properties: dict[str, Any] | None = None,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Create relationship between nodes"""
|
||||
|
||||
with tracer.start_as_current_span("create_relationship") as span:
|
||||
span.set_attribute("from_label", from_label)
|
||||
span.set_attribute("to_label", to_label)
|
||||
span.set_attribute("relationship_type", relationship_type)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Add metadata
|
||||
rel_properties = properties or {}
|
||||
rel_properties["tenant_id"] = tenant_id
|
||||
rel_properties["created_by"] = current_user.get("sub", "system")
|
||||
|
||||
# Create relationship
|
||||
await neo4j_client.create_relationship(
|
||||
from_label, from_id, to_label, to_id, relationship_type, rel_properties
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("relationships_created_total").labels(
|
||||
tenant_id=tenant_id, relationship_type=relationship_type
|
||||
).inc()
|
||||
|
||||
logger.info(
|
||||
"Relationship created",
|
||||
from_id=from_id,
|
||||
to_id=to_id,
|
||||
type=relationship_type,
|
||||
)
|
||||
|
||||
return {
|
||||
"status": "created",
|
||||
"from_id": from_id,
|
||||
"to_id": to_id,
|
||||
"relationship_type": relationship_type,
|
||||
"properties": rel_properties,
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to create relationship", error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Failed to create relationship: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.post("/query")
|
||||
async def execute_query(
|
||||
query: str,
|
||||
parameters: dict[str, Any] | None = None,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Execute custom Cypher query with tenant isolation"""
|
||||
|
||||
with tracer.start_as_current_span("execute_query") as span:
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Add tenant isolation to parameters
|
||||
query_params = parameters or {}
|
||||
query_params["tenant_id"] = tenant_id
|
||||
|
||||
# Validate query (basic security check)
|
||||
if not _is_safe_query(query):
|
||||
raise HTTPException(status_code=400, detail="Unsafe query detected")
|
||||
|
||||
# Execute query with timeout
|
||||
results = await neo4j_client.run_query(query, query_params, max_retries=1)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("custom_queries_total").labels(tenant_id=tenant_id).inc()
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
"parameters": query_params,
|
||||
"results": results,
|
||||
"count": len(results),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Query execution failed", query=query[:100], error=str(e))
|
||||
raise HTTPException(status_code=500, detail=f"Query failed: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/export/rdf")
|
||||
async def export_rdf(
|
||||
format: str = Query(default="turtle"),
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Export knowledge graph as RDF"""
|
||||
|
||||
with tracer.start_as_current_span("export_rdf") as span:
|
||||
span.set_attribute("format", format)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Export tenant-specific data
|
||||
rdf_data = await neo4j_client.export_to_rdf(format)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("rdf_exports_total").labels(
|
||||
tenant_id=tenant_id, format=format
|
||||
).inc()
|
||||
|
||||
return {
|
||||
"format": format,
|
||||
"rdf_data": rdf_data,
|
||||
"exported_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("RDF export failed", format=format, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"RDF export failed: {str(e)}"
|
||||
) from e
|
||||
|
||||
|
||||
@app.post("/validate")
|
||||
async def validate_graph(
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Validate knowledge graph with SHACL"""
|
||||
|
||||
with tracer.start_as_current_span("validate_graph") as span:
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
if not shacl_validator:
|
||||
raise HTTPException(
|
||||
status_code=501, detail="SHACL validation not configured"
|
||||
)
|
||||
|
||||
# Export current graph state
|
||||
rdf_export = await neo4j_client.export_to_rdf("turtle")
|
||||
|
||||
# Extract RDF data from export result
|
||||
rdf_data = rdf_export.get("rdf_data", "")
|
||||
if not rdf_data:
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to export RDF data for validation"
|
||||
)
|
||||
|
||||
# Run SHACL validation
|
||||
validation_result = await shacl_validator.validate_graph(rdf_data)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("validations_total").labels(
|
||||
tenant_id=tenant_id, conforms=validation_result["conforms"]
|
||||
).inc()
|
||||
|
||||
return {
|
||||
"conforms": validation_result["conforms"],
|
||||
"violations_count": validation_result["violations_count"],
|
||||
"results_text": validation_result["results_text"],
|
||||
"validated_at": datetime.utcnow().isoformat(),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Graph validation failed", error=str(e))
|
||||
raise HTTPException(status_code=500, detail=f"Validation failed: {str(e)}")
|
||||
|
||||
|
||||
async def _validate_node(label: str, properties: dict[str, Any]) -> bool:
|
||||
"""Validate node with SHACL"""
|
||||
if not shacl_validator:
|
||||
return True
|
||||
|
||||
try:
|
||||
# Create a minimal RDF representation of the node for validation
|
||||
rdf_lines = ["@prefix tax: <https://tax-kg.example.com/> ."]
|
||||
node_uri = "tax:temp_node"
|
||||
|
||||
# Add type declaration
|
||||
rdf_lines.append(f"{node_uri} a tax:{label} .")
|
||||
|
||||
# Add properties
|
||||
for prop, value in properties.items():
|
||||
if isinstance(value, str):
|
||||
rdf_lines.append(f'{node_uri} tax:{prop} "{value}" .')
|
||||
else:
|
||||
rdf_lines.append(f"{node_uri} tax:{prop} {value} .")
|
||||
|
||||
rdf_data = "\n".join(rdf_lines)
|
||||
|
||||
# Validate the node RDF data
|
||||
validation_result = await shacl_validator.validate_graph(rdf_data)
|
||||
|
||||
if not validation_result["conforms"]:
|
||||
logger.warning(
|
||||
"Node SHACL validation failed",
|
||||
label=label,
|
||||
violations=validation_result["violations_count"],
|
||||
details=validation_result["results_text"],
|
||||
)
|
||||
return False
|
||||
|
||||
logger.debug("Node SHACL validation passed", label=label)
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Node SHACL validation error", label=label, error=str(e))
|
||||
# Return True to not block operations on validation errors
|
||||
return True
|
||||
|
||||
|
||||
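# Example (illustrative only): for label="IncomeItem" and
# properties={"amount": 1200.0, "currency": "GBP"}, _validate_node above builds
#   @prefix tax: <https://tax-kg.example.com/> .
#   tax:temp_node a tax:IncomeItem .
#   tax:temp_node tax:amount 1200.0 .
#   tax:temp_node tax:currency "GBP" .
# before handing the snippet to the SHACL validator.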
def _is_safe_query(query: str) -> bool:
    """Basic query safety check"""
    query_lower = query.lower()

    # Block dangerous operations
    dangerous_keywords = [
        "delete",
        "remove",
        "drop",
        "create index",
        "create constraint",
        "load csv",
        "call",
        "foreach",
    ]

    for keyword in dangerous_keywords:
        if keyword in query_lower:
            return False

    return True


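# Sketch (not wired in): the substring test above also rejects harmless queries such
# as `... WHERE n.name CONTAINS 'recall' ...` because "call" appears inside another
# word. A word-boundary variant over the same keyword list could look like this;
# treat it as an assumption, not the project's agreed behaviour.
def _is_safe_query_strict(query: str) -> bool:
    """Word-boundary variant of _is_safe_query (illustrative sketch)."""
    import re

    dangerous_patterns = [
        r"\bdelete\b",
        r"\bremove\b",
        r"\bdrop\b",
        r"\bcreate\s+index\b",
        r"\bcreate\s+constraint\b",
        r"\bload\s+csv\b",
        r"\bcall\b",
        r"\bforeach\b",
    ]
    lowered = query.lower()
    return not any(re.search(pattern, lowered) for pattern in dangerous_patterns)

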
@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8005, reload=True, log_config=None)

apps/svc_kg/requirements.txt (new file, 22 lines)
@@ -0,0 +1,22 @@
# Service-specific dependencies
# RDF and semantic web
rdflib>=7.0.0
pyshacl>=0.25.0

# Graph algorithms
networkx>=3.2.0

# Data export formats
xmltodict>=0.13.0

# Query optimization
pyparsing>=3.1.0

# Graph visualization (optional)
graphviz>=0.20.0

# Additional Neo4j utilities
neomodel>=5.2.0

# Cypher query building
py2neo>=2021.2.4

apps/svc_normalize_map/Dockerfile (new file, 53 lines)
@@ -0,0 +1,53 @@
# Multi-stage build for svc_normalize_map
FROM python:3.12-slim AS builder

# Install build dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create virtual environment
RUN python -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Copy requirements and install dependencies
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_normalize_map/requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt

# Production stage
FROM python:3.12-slim

# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/* \
    && groupadd -r appuser \
    && useradd -r -g appuser appuser

# Copy virtual environment from builder
COPY --from=builder /opt/venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_normalize_map/ ./apps/svc_normalize_map/

# Create non-root user and set permissions
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_normalize_map.main:app", "--host", "0.0.0.0", "--port", "8000"]
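# Local build/run example (illustrative; the image tag is an assumption):
#   docker build -f apps/svc_normalize_map/Dockerfile -t svc-normalize-map:dev .
#   docker run --rm -p 8000:8000 svc-normalize-map:dev
# The build context must be the repository root so COPY libs/ and COPY apps/... resolve.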

apps/svc_normalize_map/main.py (new file, 590 lines)
@@ -0,0 +1,590 @@
"""Data normalization and knowledge graph mapping."""

# FILE: apps/svc-normalize-map/main.py
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
# mypy: disable-error-code=union-attr


import os

# Import shared libraries
import sys
from datetime import datetime
from decimal import Decimal
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import (
    BaseAppSettings,
    create_event_bus,
    create_minio_client,
    create_neo4j_client,
)
from libs.events import EventBus, EventPayload, EventTopics
from libs.neo import Neo4jClient
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class NormalizeMapSettings(BaseAppSettings):
    """Settings for normalize-map service"""

    service_name: str = "svc-normalize-map"

    # Normalization configuration
    currency_default: str = "GBP"
    date_formats: list[str] = [
        "%Y-%m-%d",
        "%d/%m/%Y",
        "%d-%m-%Y",
        "%d %B %Y",
        "%d %b %Y",
        "%B %d, %Y",
    ]

    # Mapping configuration
    confidence_threshold: float = 0.7
    auto_create_entities: bool = True

    # Validation rules
    max_amount: float = 1000000.0  # £1M
    min_confidence: float = 0.5


# Create app and settings
app, settings = create_app(
    service_name="svc-normalize-map",
    title="Tax Agent Normalize-Map Service",
    description="Data normalization and knowledge graph mapping service",
    settings_class=NormalizeMapSettings,
)

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
neo4j_client: Neo4jClient | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-normalize-map")
metrics = get_metrics()


@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, neo4j_client, event_bus

    logger.info("Starting normalize-map service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize Neo4j client
    neo4j_driver = create_neo4j_client(settings)
    neo4j_client = Neo4jClient(neo4j_driver)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    await event_bus.start()

    # Subscribe to extraction completion events
    await event_bus.subscribe(  # type: ignore
        EventTopics.DOC_EXTRACTED, _handle_extraction_completed
    )

    logger.info("Normalize-map service started successfully")


@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global event_bus, neo4j_client

    logger.info("Shutting down normalize-map service")

    if neo4j_client:
        await neo4j_client.close()

    if event_bus:
        await event_bus.stop()

    logger.info("Normalize-map service shutdown complete")


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": datetime.utcnow().isoformat(),
    }


@app.post("/normalize/{doc_id}")
async def normalize_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Normalize and map document data to knowledge graph"""

    with tracer.start_as_current_span("normalize_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Check if extraction results exist
            extraction_results = await document_storage.get_extraction_result(
                tenant_id, doc_id
            )
            if not extraction_results:
                raise HTTPException(
                    status_code=404, detail="Extraction results not found"
                )

            # Generate normalization ID
            normalization_id = str(ulid.new())
            span.set_attribute("normalization_id", normalization_id)

            # Start background normalization
            background_tasks.add_task(
                _normalize_and_map_async,
                doc_id,
                tenant_id,
                extraction_results,
                normalization_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "Normalization started",
                doc_id=doc_id,
                normalization_id=normalization_id,
            )

            return {
                "normalization_id": normalization_id,
                "doc_id": doc_id,
                "status": "processing",
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start normalization", doc_id=doc_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to start normalization")


async def _handle_extraction_completed(topic: str, payload: EventPayload) -> None:
    """Handle extraction completion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")
        confidence = data.get("confidence", 0.0)

        if not doc_id or not tenant_id:
            logger.warning("Invalid extraction completion event", data=data)
            return

        # Only auto-process if confidence is above threshold
        if confidence >= settings.confidence_threshold:
            logger.info(
                "Auto-normalizing extracted document",
                doc_id=doc_id,
                confidence=confidence,
            )

            extraction_results = data.get("extraction_results")
            if not extraction_results:
                extraction_results = await document_storage.get_extraction_result(
                    tenant_id, doc_id
                )

            if extraction_results:
                await _normalize_and_map_async(
                    doc_id=doc_id,
                    tenant_id=tenant_id,
                    extraction_results=extraction_results,
                    normalization_id=str(ulid.new()),
                    actor=payload.actor,
                )
        else:
            logger.info(
                "Skipping auto-normalization due to low confidence",
                doc_id=doc_id,
                confidence=confidence,
            )

    except Exception as e:
        logger.error("Failed to handle extraction completion", error=str(e))


async def _normalize_and_map_async(
    doc_id: str,
    tenant_id: str,
    extraction_results: dict[str, Any],
    normalization_id: str,
    actor: str,
) -> None:
    """Normalize and map data asynchronously"""

    with tracer.start_as_current_span("normalize_and_map_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("normalization_id", normalization_id)

        try:
            extracted_fields = extraction_results.get("extracted_fields", {})
            provenance = extraction_results.get("provenance", [])

            # Normalize extracted data
            normalized_data = await _normalize_data(extracted_fields, provenance)

            # Map to knowledge graph entities
            entities = await _map_to_entities(normalized_data, doc_id, tenant_id)

            # Store entities in knowledge graph
            stored_entities = await _store_entities(entities, tenant_id)

            # Create normalization results
            normalization_results = {
                "doc_id": doc_id,
                "normalization_id": normalization_id,
                "normalized_at": datetime.utcnow().isoformat(),
                "normalized_data": normalized_data,
                "entities": stored_entities,
                "entity_count": len(stored_entities),
            }

            logger.info("Normalization completed", results=normalization_results)

            # Update metrics
            metrics.counter("documents_normalized_total").labels(
                tenant_id=tenant_id
            ).inc()

            metrics.histogram("entities_created").labels(tenant_id=tenant_id).observe(
                len(stored_entities)
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "normalization_id": normalization_id,
                    "entity_count": len(stored_entities),
                    "entities": stored_entities,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.KG_UPSERTED, event_payload)

            logger.info(
                "Normalization completed", doc_id=doc_id, entities=len(stored_entities)
            )

        except Exception as e:
            logger.error("Normalization failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("normalization_errors_total").labels(
                tenant_id=tenant_id, error_type=type(e).__name__
            ).inc()


async def _normalize_data(
    extracted_fields: dict[str, Any], provenance: list[dict[str, Any]]
) -> dict[str, Any]:
    """Normalize extracted data"""

    normalized = {}

    for field_name, raw_value in extracted_fields.items():
        try:
            if "amount" in field_name.lower() or "total" in field_name.lower():
                normalized[field_name] = _normalize_amount(raw_value)
            elif "date" in field_name.lower():
                normalized[field_name] = _normalize_date(raw_value)
            elif "name" in field_name.lower():
                normalized[field_name] = _normalize_name(raw_value)
            elif "address" in field_name.lower():
                normalized[field_name] = _normalize_address(raw_value)
            elif "number" in field_name.lower():
                normalized[field_name] = _normalize_number(raw_value)
            else:
                normalized[field_name] = _normalize_text(raw_value)

        except Exception as e:
            logger.warning(
                "Failed to normalize field",
                field=field_name,
                value=raw_value,
                error=str(e),
            )
            normalized[field_name] = raw_value  # Keep original value

    return normalized


def _normalize_amount(value: str) -> dict[str, Any]:
    """Normalize monetary amount"""
    import re

    if not value:
        return {"amount": None, "currency": settings.currency_default}

    # Remove currency symbols and formatting
    clean_value = re.sub(r"[£$€,\s]", "", str(value))

    try:
        amount = Decimal(clean_value)

        # Validate amount
        if amount > settings.max_amount:
            logger.warning("Amount exceeds maximum", amount=amount)

        return {
            "amount": float(amount),
            "currency": settings.currency_default,
            "original": value,
        }
    except Exception:
        return {
            "amount": None,
            "currency": settings.currency_default,
            "original": value,
        }


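# Illustrative behaviour of _normalize_amount with the default settings above:
#   "£1,234.56" -> {"amount": 1234.56, "currency": "GBP", "original": "£1,234.56"}
#   "n/a"       -> {"amount": None,    "currency": "GBP", "original": "n/a"}
# Amounts above max_amount are kept but logged as a warning rather than rejected.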
def _normalize_date(value: str) -> dict[str, Any]:
    """Normalize date"""
    from dateutil import parser

    if not value:
        return {"date": None, "original": value}

    try:
        # Try parsing with dateutil first
        parsed_date = parser.parse(str(value), dayfirst=True)
        return {"date": parsed_date.date().isoformat(), "original": value}
    except Exception:
        # Try manual formats
        for fmt in settings.date_formats:
            try:
                parsed_date = datetime.strptime(str(value), fmt)
                return {"date": parsed_date.date().isoformat(), "original": value}
            except Exception:
                continue

        return {"date": None, "original": value}


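# Illustrative behaviour of _normalize_date (dayfirst=True suits UK documents):
#   "31/01/2024"   -> {"date": "2024-01-31", "original": "31/01/2024"}
#   "5 April 2024" -> {"date": "2024-04-05", "original": "5 April 2024"}
#   "not a date"   -> {"date": None,         "original": "not a date"}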
def _normalize_name(value: str) -> dict[str, Any]:
    """Normalize person/company name"""
    if not value:
        return {"name": None, "original": value}

    # Clean and title case
    clean_name = str(value).strip().title()

    # Detect if it's a company (contains Ltd, Limited, etc.)
    company_indicators = ["Ltd", "Limited", "Plc", "Inc", "Corp", "Company"]
    is_company = any(indicator in clean_name for indicator in company_indicators)

    return {
        "name": clean_name,
        "type": "company" if is_company else "person",
        "original": value,
    }


def _normalize_address(value: str) -> dict[str, Any]:
    """Normalize address"""
    import re

    if not value:
        return {"address": None, "original": value}

    clean_address = str(value).strip()

    # Extract UK postcode
    postcode_pattern = r"\b[A-Z]{1,2}\d[A-Z0-9]?\s*\d[A-Z]{2}\b"
    postcode_match = re.search(postcode_pattern, clean_address, re.IGNORECASE)
    postcode = postcode_match.group().upper() if postcode_match else None

    return {"address": clean_address, "postcode": postcode, "original": value}


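# Illustrative behaviour of _normalize_address: the regex pulls a UK postcode
# (outward + inward code) out of the string wherever it appears, e.g.
#   "10 Downing Street, London SW1A 2AA"
#     -> {"address": "10 Downing Street, London SW1A 2AA", "postcode": "SW1A 2AA", ...}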
def _normalize_number(value: str) -> dict[str, Any]:
    """Normalize reference numbers"""
    import re

    if not value:
        return {"number": None, "original": value}

    # Remove spaces and special characters
    clean_number = re.sub(r"[^\w]", "", str(value))

    # Detect number type
    number_type = "unknown"
    if len(clean_number) == 10 and clean_number.isdigit():
        number_type = "utr"  # UTR is 10 digits
    elif len(clean_number) == 8 and clean_number.isdigit():
        number_type = "account_number"
    elif re.match(r"^\d{6}$", clean_number):
        number_type = "sort_code"

    return {"number": clean_number, "type": number_type, "original": value}


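# Illustrative behaviour of _normalize_number (UK reference formats):
#   "12345 67890" -> {"number": "1234567890", "type": "utr", ...}           # UTR: 10 digits
#   "12345678"    -> {"number": "12345678", "type": "account_number", ...}  # bank account: 8 digits
#   "12-34-56"    -> {"number": "123456", "type": "sort_code", ...}         # sort code: 6 digits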
def _normalize_text(value: str) -> dict[str, Any]:
    """Normalize general text"""
    if not value:
        return {"text": None, "original": value}

    clean_text = str(value).strip()

    return {"text": clean_text, "original": value}


async def _map_to_entities(
    normalized_data: dict[str, Any], doc_id: str, tenant_id: str
) -> list[dict[str, Any]]:
    """Map normalized data to knowledge graph entities"""

    entities = []

    # Create document entity
    doc_entity = {
        "type": "Document",
        "id": doc_id,
        "properties": {
            "doc_id": doc_id,
            "tenant_id": tenant_id,
            "processed_at": datetime.utcnow().isoformat(),
            "source": "extraction",
            "extractor_version": "1.0.0",
            "valid_from": datetime.utcnow(),
            "asserted_at": datetime.utcnow(),
        },
    }
    entities.append(doc_entity)

    # Map specific field types to entities
    for field_name, normalized_value in normalized_data.items():
        if isinstance(normalized_value, dict):
            if "amount" in normalized_value and normalized_value["amount"] is not None:
                # Create expense or income item
                entity_type = (
                    "ExpenseItem" if "expense" in field_name.lower() else "IncomeItem"
                )
                entity = {
                    "type": entity_type,
                    "id": f"{entity_type.lower()}_{ulid.new()}",
                    "properties": {
                        "amount": normalized_value["amount"],
                        "currency": normalized_value["currency"],
                        "description": field_name,
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

            elif "name" in normalized_value and normalized_value["name"] is not None:
                # Create party entity
                entity = {
                    "type": "Party",
                    "id": f"party_{ulid.new()}",
                    "properties": {
                        "name": normalized_value["name"],
                        "party_type": normalized_value.get("type", "unknown"),
                        "source": doc_id,
                        "extractor_version": "1.0.0",
                        "valid_from": datetime.utcnow(),
                        "asserted_at": datetime.utcnow(),
                    },
                }
                entities.append(entity)

    return entities


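# Illustrative mapping: a normalized field such as
#   {"total_expense_amount": {"amount": 250.0, "currency": "GBP", "original": "£250"}}
# yields one Document node plus one ExpenseItem node (the field name contains
# "expense"); any other amount-bearing field becomes an IncomeItem, and name-bearing
# fields become Party nodes with party_type "person" or "company".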
async def _store_entities(
    entities: list[dict[str, Any]], tenant_id: str
) -> list[dict[str, Any]]:
    """Store entities in knowledge graph"""

    stored_entities = []

    for entity in entities:
        try:
            # Create node in Neo4j
            result = await neo4j_client.create_node(
                label=entity["type"], properties=entity["properties"]
            )

            stored_entities.append(
                {
                    "type": entity["type"],
                    "id": entity["id"],
                    "neo4j_id": result.get("id"),
                    "properties": entity["properties"],
                }
            )

            logger.debug("Entity stored", type=entity["type"], id=entity["id"])

        except Exception as e:
            logger.error("Failed to store entity", entity=entity, error=str(e))

    return stored_entities


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8004, reload=True, log_config=None)

apps/svc_normalize_map/requirements.txt (new file, 37 lines)
@@ -0,0 +1,37 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0

# Service-specific dependencies
# Data normalization and cleaning
pandas>=2.1.0
numpy>=1.24.0

# Currency and exchange rates
forex-python>=1.8
babel>=2.13.0

# Date and time processing
python-dateutil>=2.8.0
pytz>=2023.3

# Text normalization
unidecode>=1.3.0
phonenumbers>=8.13.0

# Entity resolution and matching
recordlinkage>=0.16.0
fuzzywuzzy>=0.18.0
python-Levenshtein>=0.23.0

# Geographic data
geopy>=2.4.0
pycountry>=23.12.0

# Data validation
cerberus>=1.3.4
marshmallow>=3.20.0

# UK-specific utilities
uk-postcode-utils>=1.0.0

apps/svc_ocr/Dockerfile (new file, 43 lines)
@@ -0,0 +1,43 @@
# Dockerfile for svc_ocr - Uses base-ml image
# Base image contains: FastAPI, database drivers, transformers, PyTorch, numpy, etc.
# This Dockerfile adds OCR-specific dependencies and application code

ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}

# Switch to root to install system and service-specific dependencies
USER root

# Install OCR runtime dependencies (Tesseract, poppler)
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy service-specific requirements and install
COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_ocr/ ./apps/svc_ocr/

# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_ocr.main:app", "--host", "0.0.0.0", "--port", "8000"]

apps/svc_ocr/main.py (new file, 504 lines)
@@ -0,0 +1,504 @@
# FILE: apps/svc-ocr/main.py
# OCR and layout extraction using Tesseract, LayoutLM, and document AI

import os

# Import shared libraries
import sys
from datetime import datetime
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_minio_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id
from libs.storage import DocumentStorage, StorageClient

logger = structlog.get_logger()


class OCRSettings(BaseAppSettings):
    """Settings for OCR service"""

    service_name: str = "svc-ocr"

    # OCR configuration
    tesseract_cmd: str = "/usr/bin/tesseract"
    tesseract_config: str = "--oem 3 --psm 6"
    languages: str = "eng"

    # Layout analysis
    layoutlm_model: str = "microsoft/layoutlm-base-uncased"
    confidence_threshold: float = 0.7

    # Processing limits
    max_pages: int = 50
    max_file_size: int = 100 * 1024 * 1024  # 100MB

    # Output configuration
    include_coordinates: bool = True
    include_confidence: bool = True


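# Note on the defaults above: "--oem 3" lets Tesseract pick the best available OCR
# engine (LSTM where present) and "--psm 6" assumes a single uniform block of text,
# which suits most scanned statements and invoices; the language is appended at call
# time as "-l eng" in _process_with_tesseract below.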
# Create app and settings
app, settings = create_app(
    service_name="svc-ocr",
    title="Tax Agent OCR Service",
    description="OCR and layout extraction service",
    settings_class=OCRSettings,
)  # fmt: skip

# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
tracer = get_tracer("svc-ocr")
metrics = get_metrics()


@app.on_event("startup")
async def startup_event() -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, event_bus

    logger.info("Starting OCR service")

    # Setup observability
    setup_observability(settings)

    # Initialize MinIO client
    minio_client = create_minio_client(settings)
    storage_client = StorageClient(minio_client)
    document_storage = DocumentStorage(storage_client)

    # Initialize event bus
    event_bus = create_event_bus(settings)
    if not event_bus:
        raise HTTPException(status_code=500, detail="Event bus not initialized")

    await event_bus.start()

    # Subscribe to document ingestion events
    await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested)

    logger.info("OCR service started successfully")


@app.on_event("shutdown")
async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global event_bus

    logger.info("Shutting down OCR service")

    if event_bus:
        await event_bus.stop()

    logger.info("OCR service shutdown complete")


@app.get("/health")
async def health_check() -> dict[str, Any]:
    """Health check endpoint"""
    return {
        "status": "healthy",
        "service": settings.service_name,
        "version": settings.service_version,
        "timestamp": datetime.utcnow().isoformat(),
    }


@app.post("/process/{doc_id}")
async def process_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    strategy: str = "hybrid",
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Process document with OCR"""

    with tracer.start_as_current_span("process_document") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("tenant_id", tenant_id)
        span.set_attribute("strategy", strategy)

        try:
            # Check if document exists
            doc_content = await document_storage.get_document(tenant_id, doc_id)
            if not doc_content:
                raise HTTPException(status_code=404, detail="Document not found")

            # Generate processing ID
            processing_id = str(ulid.new())
            span.set_attribute("processing_id", processing_id)

            # Start background processing
            background_tasks.add_task(
                _process_document_async,
                doc_id,
                tenant_id,
                doc_content,
                strategy,
                processing_id,
                current_user.get("sub", "system"),
            )

            logger.info(
                "OCR processing started", doc_id=doc_id, processing_id=processing_id
            )

            return {
                "processing_id": processing_id,
                "doc_id": doc_id,
                "status": "processing",
                "strategy": strategy,
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to start processing")


@app.get("/results/{doc_id}")
|
||||
async def get_ocr_results(
|
||||
doc_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get OCR results for document"""
|
||||
|
||||
with tracer.start_as_current_span("get_ocr_results") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Get OCR results from storage
|
||||
ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id)
|
||||
|
||||
if not ocr_results:
|
||||
raise HTTPException(status_code=404, detail="OCR results not found")
|
||||
|
||||
return ocr_results
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to get OCR results")
|
||||
|
||||
|
||||
async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle document ingestion events"""
|
||||
try:
|
||||
data = payload.data
|
||||
doc_id = data.get("doc_id")
|
||||
tenant_id = data.get("tenant_id")
|
||||
|
||||
if not doc_id or not tenant_id:
|
||||
logger.warning("Invalid document ingestion event", data=data)
|
||||
return
|
||||
|
||||
# Auto-process PDF documents
|
||||
if data.get("content_type") == "application/pdf":
|
||||
logger.info("Auto-processing ingested document", doc_id=doc_id)
|
||||
|
||||
# Get document content
|
||||
doc_content = await document_storage.get_document(tenant_id, doc_id)
|
||||
if doc_content:
|
||||
await _process_document_async(
|
||||
doc_id=doc_id,
|
||||
tenant_id=tenant_id,
|
||||
content=doc_content,
|
||||
strategy="hybrid",
|
||||
processing_id=str(ulid.new()),
|
||||
actor=payload.actor,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to handle document ingestion", error=str(e))
|
||||
|
||||
|
||||
async def _process_document_async(
    doc_id: str,
    tenant_id: str,
    content: bytes,
    strategy: str,
    processing_id: str,
    actor: str,
) -> None:
    """Process document asynchronously"""

    with tracer.start_as_current_span("process_document_async") as span:
        span.set_attribute("doc_id", doc_id)
        span.set_attribute("processing_id", processing_id)
        span.set_attribute("strategy", strategy)

        try:
            # Convert PDF to images
            images = await _pdf_to_images(content)

            # Process each page
            pages_data: list[Any] = []
            for page_num, image in enumerate(images, 1):
                page_data = await _process_page(image, page_num, strategy)
                pages_data.append(page_data)

            # Combine results
            ocr_results = {
                "doc_id": doc_id,
                "processing_id": processing_id,
                "strategy": strategy,
                "processed_at": datetime.utcnow().isoformat(),
                "total_pages": len(pages_data),
                "pages": pages_data,
                "metadata": {
                    "confidence_threshold": settings.confidence_threshold,
                    "languages": settings.languages,
                },
            }

            # Store results
            await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results)

            # Update metrics
            metrics.counter("documents_processed_total").labels(
                tenant_id=tenant_id, strategy=strategy
            ).inc()

            metrics.histogram("processing_duration_seconds").labels(
                strategy=strategy
            ).observe(
                datetime.utcnow().timestamp()
                - datetime.fromisoformat(
                    ocr_results["processed_at"].replace("Z", "")
                ).timestamp()
            )

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "processing_id": processing_id,
                    "strategy": strategy,
                    "total_pages": len(pages_data),
                    "ocr_results": ocr_results,
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload)

            logger.info(
                "OCR processing completed", doc_id=doc_id, pages=len(pages_data)
            )

        except Exception as e:
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))

            # Update error metrics
            metrics.counter("processing_errors_total").labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()


async def _pdf_to_images(pdf_content: bytes) -> list[bytes]:
    """Convert PDF to images"""
    try:
        import fitz  # PyMuPDF

        # Open PDF
        pdf_doc = fitz.open(stream=pdf_content, filetype="pdf")

        images: list[Any] = []
        for page_num in range(min(len(pdf_doc), settings.max_pages)):
            page = pdf_doc[page_num]

            # Render page to image
            mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better OCR
            pix = page.get_pixmap(matrix=mat)
            img_data = pix.tobytes("png")

            images.append(img_data)

        pdf_doc.close()
        return images

    except ImportError:
        logger.error("PyMuPDF not available, using fallback")
        return await _pdf_to_images_fallback(pdf_content)
    except Exception as e:
        logger.error("PDF conversion failed", error=str(e))
        raise


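# Note: fitz.Matrix(2.0, 2.0) renders at twice PyMuPDF's 72 dpi base, i.e. roughly
# 144 dpi, which is usually enough for Tesseract while keeping page images small;
# the pdf2image fallback below renders at 200 dpi instead.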
async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]:
    """Fallback PDF to images conversion"""
    try:
        from pdf2image import convert_from_bytes

        images = convert_from_bytes(
            pdf_content, dpi=200, first_page=1, last_page=settings.max_pages
        )

        # Convert PIL images to bytes
        image_bytes: list[Any] = []
        for img in images:
            import io

            img_buffer = io.BytesIO()
            img.save(img_buffer, format="PNG")
            image_bytes.append(img_buffer.getvalue())

        return image_bytes

    except ImportError:
        logger.error("pdf2image not available")
        raise Exception("No PDF conversion library available")


async def _process_page(
    image_data: bytes, page_num: int, strategy: str
) -> dict[str, Any]:
    """Process single page with OCR"""

    if strategy == "tesseract":
        return await _process_with_tesseract(image_data, page_num)
    elif strategy == "layoutlm":
        return await _process_with_layoutlm(image_data, page_num)
    elif strategy == "hybrid":
        # Combine both approaches
        tesseract_result = await _process_with_tesseract(image_data, page_num)
        layoutlm_result = await _process_with_layoutlm(image_data, page_num)

        return {
            "page": page_num,
            "strategy": "hybrid",
            "tesseract": tesseract_result,
            "layoutlm": layoutlm_result,
            "text": tesseract_result.get("text", ""),
            "confidence": max(
                tesseract_result.get("confidence", 0),
                layoutlm_result.get("confidence", 0),
            ),
        }
    else:
        raise ValueError(f"Unknown strategy: {strategy}")


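# Illustrative hybrid result: with LayoutLM still a placeholder, the merged page dict
# carries the Tesseract text and whichever confidence is higher, e.g.
#   {"page": 1, "strategy": "hybrid", "text": "...", "confidence": 0.91, ...}
# so downstream consumers can switch strategies without changing the result shape.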
async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with Tesseract OCR"""
    try:
        import io

        import pytesseract
        from PIL import Image

        # Load image
        image = Image.open(io.BytesIO(image_data))

        # Configure Tesseract
        config = f"{settings.tesseract_config} -l {settings.languages}"

        # Extract text with confidence
        data = pytesseract.image_to_data(
            image, config=config, output_type=pytesseract.Output.DICT
        )

        # Process results
        words: list[Any] = []
        confidences: list[Any] = []

        for i in range(len(data["text"])):
            if int(data["conf"][i]) > 0:  # Valid confidence
                word_data = {
                    "text": data["text"][i],
                    "confidence": int(data["conf"][i]) / 100.0,
                    "bbox": [
                        data["left"][i],
                        data["top"][i],
                        data["left"][i] + data["width"][i],
                        data["top"][i] + data["height"][i],
                    ],
                }
                words.append(word_data)
                confidences.append(word_data["confidence"])

        # Extract full text
        full_text = pytesseract.image_to_string(image, config=config)

        return {
            "page": page_num,
            "strategy": "tesseract",
            "text": full_text.strip(),
            "words": words,
            "confidence": sum(confidences) / len(confidences) if confidences else 0.0,
            "word_count": len(words),
        }

    except ImportError:
        logger.error("pytesseract not available")
        return {
            "page": page_num,
            "strategy": "tesseract",
            "error": "pytesseract not available",
        }
    except Exception as e:
        logger.error("Tesseract processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "tesseract", "error": str(e)}


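# Note: pytesseract.Output.DICT returns parallel lists keyed by "text", "conf",
# "left", "top", "width" and "height"; entries with conf <= 0 are layout rows rather
# than recognised words, which is why they are skipped above. Word confidences arrive
# as 0-100 and are rescaled to 0.0-1.0 before the page average is taken.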
async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, Any]:
    """Process page with LayoutLM"""
    try:
        # This would integrate with LayoutLM model
        # For now, return placeholder
        logger.warning("LayoutLM processing not implemented")

        return {
            "page": page_num,
            "strategy": "layoutlm",
            "text": "",
            "layout_elements": [],
            "confidence": 0.0,
            "error": "Not implemented",
        }

    except Exception as e:
        logger.error("LayoutLM processing failed", page=page_num, error=str(e))
        return {"page": page_num, "strategy": "layoutlm", "error": str(e)}


@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )


if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8002, reload=True, log_config=None)

apps/svc_ocr/requirements.txt (new file, 16 lines)
@@ -0,0 +1,16 @@
# Service-specific dependencies for svc_ocr
# NOTE: ML dependencies (transformers, torch, numpy) are in base-ml image

# OCR engines (lightweight)
pytesseract>=0.3.13

# PDF processing
PyMuPDF>=1.26.4
pdf2image>=1.17.0

# Image processing
Pillow>=11.3.0
opencv-python-headless>=4.12.0.88  # Headless version is smaller

# Computer vision (torchvision not in base-ml)
torchvision>=0.23.0

apps/svc_rag_indexer/Dockerfile (new file, 36 lines)
@@ -0,0 +1,36 @@
# Dockerfile for svc_rag_indexer - Uses base-ml image
# Base image contains: FastAPI, database drivers, sentence-transformers, PyTorch, numpy, etc.
# This Dockerfile only adds service-specific dependencies and application code

ARG REGISTRY=gitea.harkon.co.uk
ARG OWNER=harkon
ARG BASE_VERSION=v1.0.1
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}

# Switch to root to install service-specific dependencies
USER root

# Set working directory
WORKDIR /app

# Copy service-specific requirements and install
COPY apps/svc_rag_indexer/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt

# Copy application code
COPY libs/ ./libs/
COPY apps/svc_rag_indexer/ ./apps/svc_rag_indexer/

# Set permissions and switch to non-root user
RUN chown -R appuser:appuser /app
USER appuser

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/healthz || exit 1

# Expose port
EXPOSE 8000

# Run the application
CMD ["python", "-m", "uvicorn", "apps.svc_rag_indexer.main:app", "--host", "0.0.0.0", "--port", "8000"]

apps/svc_rag_indexer/main.py (new file, 535 lines)
@@ -0,0 +1,535 @@
# FILE: apps/svc-rag-indexer/main.py
# mypy: disable-error-code=union-attr
# Vector database indexing with PII protection and de-identification

import os

# Import shared libraries
import sys
from datetime import datetime
from typing import Any

import structlog
import ulid
from fastapi import BackgroundTasks, Depends, HTTPException, Request
from fastapi.responses import JSONResponse

sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

from libs.app_factory import create_app
from libs.config import BaseAppSettings, create_event_bus, create_qdrant_client
from libs.events import EventBus, EventPayload, EventTopics
from libs.observability import get_metrics, get_tracer, setup_observability
from libs.rag import PIIDetector, QdrantCollectionManager
from libs.schemas import ErrorResponse
from libs.security import get_current_user, get_tenant_id

logger = structlog.get_logger()


class RAGIndexerSettings(BaseAppSettings):
    """Settings for RAG indexer service"""

    service_name: str = "svc-rag-indexer"

    # Embedding configuration
    embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_dimension: int = 384

    # Chunking configuration
    chunk_size: int = 512
    chunk_overlap: int = 50

    # Collection configuration
    collections: dict[str, str] = {
        "documents": "Document chunks with metadata",
        "tax_rules": "Tax rules and regulations",
        "case_law": "Tax case law and precedents",
        "guidance": "HMRC guidance and manuals",
    }

    # PII protection
    require_pii_free: bool = True
    auto_deidentify: bool = True


# Create app and settings
app, settings = create_app(
    service_name="svc-rag-indexer",
    title="Tax Agent RAG Indexer Service",
    description="Vector database indexing with PII protection",
    settings_class=RAGIndexerSettings,
)

# Global clients
qdrant_client = None
collection_manager: QdrantCollectionManager | None = None
pii_detector: PIIDetector | None = None
event_bus: EventBus | None = None
embedding_model = None
tracer = get_tracer("svc-rag-indexer")
metrics = get_metrics()


@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global qdrant_client, collection_manager, pii_detector, event_bus, embedding_model
|
||||
|
||||
logger.info("Starting RAG indexer service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Qdrant client
|
||||
qdrant_client = create_qdrant_client(settings)
|
||||
collection_manager = QdrantCollectionManager(qdrant_client)
|
||||
|
||||
# Initialize PII detector
|
||||
pii_detector = PIIDetector()
|
||||
|
||||
# Initialize embedding model
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
embedding_model = SentenceTransformer(settings.embedding_model)
|
||||
logger.info("Embedding model loaded", model=settings.embedding_model)
|
||||
except ImportError:
|
||||
logger.warning("sentence-transformers not available, using mock embeddings")
|
||||
embedding_model = None
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
await event_bus.start()
|
||||
|
||||
# Subscribe to relevant events
|
||||
await event_bus.subscribe(EventTopics.DOC_EXTRACTED, _handle_document_extracted) # type: ignore
|
||||
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
|
||||
|
||||
# Ensure collections exist
|
||||
for collection_name in settings.collections:
|
||||
await collection_manager.ensure_collection(
|
||||
collection_name=collection_name, vector_size=settings.embedding_dimension
|
||||
)
|
||||
|
||||
logger.info("RAG indexer service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global event_bus
|
||||
|
||||
logger.info("Shutting down RAG indexer service")
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("RAG indexer service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"collections": list(settings.collections.keys()),
|
||||
}
|
||||
|
||||
|
||||
@app.post("/index/{collection_name}")
|
||||
async def index_document(
|
||||
collection_name: str,
|
||||
document: dict[str, Any],
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
):
|
||||
"""Index document in vector database"""
|
||||
|
||||
with tracer.start_as_current_span("index_document") as span:
|
||||
span.set_attribute("collection_name", collection_name)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Validate collection
|
||||
if collection_name not in settings.collections:
|
||||
raise HTTPException(
|
||||
status_code=400, detail=f"Unknown collection: {collection_name}"
|
||||
)
|
||||
|
||||
# Generate indexing ID
|
||||
indexing_id = str(ulid.new())
|
||||
span.set_attribute("indexing_id", indexing_id)
|
||||
|
||||
# Start background indexing
|
||||
background_tasks.add_task(
|
||||
_index_document_async,
|
||||
collection_name,
|
||||
document,
|
||||
tenant_id,
|
||||
indexing_id,
|
||||
current_user.get("sub", "system"),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Document indexing started",
|
||||
collection=collection_name,
|
||||
indexing_id=indexing_id,
|
||||
)
|
||||
|
||||
return {
|
||||
"indexing_id": indexing_id,
|
||||
"collection": collection_name,
|
||||
"status": "indexing",
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to start indexing", collection=collection_name, error=str(e)
|
||||
)
|
||||
raise HTTPException(status_code=500, detail="Failed to start indexing")
|
||||
|
||||
|
||||
@app.get("/collections")
|
||||
async def list_collections(
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
):
|
||||
"""List available collections"""
|
||||
|
||||
try:
|
||||
collections_info: list[Any] = []
|
||||
|
||||
for collection_name, description in settings.collections.items():
|
||||
# Get collection info from Qdrant
|
||||
try:
|
||||
collection_info = qdrant_client.get_collection(collection_name)
|
||||
point_count = collection_info.points_count
|
||||
vector_count = collection_info.vectors_count
|
||||
except Exception:
|
||||
point_count = 0
|
||||
vector_count = 0
|
||||
|
||||
collections_info.append(
|
||||
{
|
||||
"name": collection_name,
|
||||
"description": description,
|
||||
"point_count": point_count,
|
||||
"vector_count": vector_count,
|
||||
}
|
||||
)
|
||||
|
||||
return {
|
||||
"collections": collections_info,
|
||||
"total_collections": len(collections_info),
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to list collections", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to list collections")
|
||||
|
||||
|
||||
async def _handle_document_extracted(topic: str, payload: EventPayload) -> None:
    """Handle document extraction completion events"""
    try:
        data = payload.data
        doc_id = data.get("doc_id")
        tenant_id = data.get("tenant_id")
        extraction_results = data.get("extraction_results")

        if not doc_id or not tenant_id or not extraction_results:
            logger.warning("Invalid document extraction event", data=data)
            return

        logger.info("Auto-indexing extracted document", doc_id=doc_id)

        # Create document for indexing
        document = {
            "doc_id": doc_id,
            "content": _extract_content_from_results(extraction_results),
            "metadata": {
                "doc_id": doc_id,
                "tenant_id": tenant_id,
                "extraction_id": extraction_results.get("extraction_id"),
                "confidence": extraction_results.get("confidence", 0.0),
                "extracted_at": extraction_results.get("extracted_at"),
                "source": "extraction",
            },
        }

        await _index_document_async(
            collection_name="documents",
            document=document,
            tenant_id=tenant_id,
            indexing_id=str(ulid.new()),
            actor=payload.actor,
        )

    except Exception as e:
        logger.error("Failed to handle document extraction event", error=str(e))


async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
    """Handle knowledge graph upsert events"""
    try:
        data = payload.data
        entities = data.get("entities", [])
        tenant_id = data.get("tenant_id")

        if not entities or not tenant_id:
            logger.warning("Invalid KG upsert event", data=data)
            return

        logger.info("Auto-indexing KG entities", count=len(entities))

        # Index entities as documents
        for entity in entities:
            document = {
                "entity_id": entity.get("id"),
                "content": _extract_content_from_entity(entity),
                "metadata": {
                    "entity_type": entity.get("type"),
                    "entity_id": entity.get("id"),
                    "tenant_id": tenant_id,
                    "source": "knowledge_graph",
                },
            }

            await _index_document_async(
                collection_name="documents",
                document=document,
                tenant_id=tenant_id,
                indexing_id=str(ulid.new()),
                actor=payload.actor,
            )

    except Exception as e:
        logger.error("Failed to handle KG upsert event", error=str(e))


async def _index_document_async(
    collection_name: str,
    document: dict[str, Any],
    tenant_id: str,
    indexing_id: str,
    actor: str,
):
    """Index document asynchronously"""

    with tracer.start_as_current_span("index_document_async") as span:
        span.set_attribute("collection_name", collection_name)
        span.set_attribute("indexing_id", indexing_id)
        span.set_attribute("tenant_id", tenant_id)

        try:
            content = document.get("content", "")
            metadata = document.get("metadata", {})

            # Check for PII and de-identify if needed
            if settings.require_pii_free:
                has_pii = pii_detector.has_pii(content)

                if has_pii:
                    if settings.auto_deidentify:
                        content, pii_mapping = pii_detector.de_identify_text(content)
                        metadata["pii_removed"] = True
                        metadata["pii_mapping_hash"] = _hash_pii_mapping(pii_mapping)
                        logger.info("PII removed from content", indexing_id=indexing_id)
                    else:
                        logger.warning(
                            "Content contains PII, skipping indexing",
                            indexing_id=indexing_id,
                        )
                        return

            # Mark as PII-free
            metadata["pii_free"] = True
            metadata["tenant_id"] = tenant_id
            metadata["indexed_at"] = datetime.utcnow().isoformat()

            # Chunk content
            chunks = _chunk_text(content)

            # Generate embeddings and index chunks
            indexed_chunks = 0
            for i, chunk in enumerate(chunks):
                try:
                    # Generate embedding
                    embedding = await _generate_embedding(chunk)

                    # Create point
                    point_id = f"{indexing_id}_{i}"

                    from qdrant_client.models import PointStruct

                    point = PointStruct(
                        id=point_id,
                        vector=embedding,
                        payload={
                            **metadata,
                            "chunk_text": chunk,
                            "chunk_index": i,
                            "total_chunks": len(chunks),
                        },
                    )

                    # Index point
                    success = await collection_manager.upsert_points(
                        collection_name, [point]
                    )

                    if success:
                        indexed_chunks += 1

                except Exception as e:
                    logger.error("Failed to index chunk", chunk_index=i, error=str(e))

            # Update metrics
            metrics.counter("documents_indexed_total").labels(
                tenant_id=tenant_id, collection=collection_name
            ).inc()

            metrics.histogram("chunks_per_document").labels(
                collection=collection_name
            ).observe(indexed_chunks)

            # Publish completion event
            event_payload = EventPayload(
                data={
                    "indexing_id": indexing_id,
                    "collection": collection_name,
                    "tenant_id": tenant_id,
                    "chunks_indexed": indexed_chunks,
                    "total_chunks": len(chunks),
                },
                actor=actor,
                tenant_id=tenant_id,
            )

            await event_bus.publish(EventTopics.RAG_INDEXED, event_payload)

            logger.info(
                "Document indexing completed",
                indexing_id=indexing_id,
                chunks=indexed_chunks,
            )

        except Exception as e:
            logger.error(
                "Document indexing failed", indexing_id=indexing_id, error=str(e)
            )

            # Update error metrics
            metrics.counter("indexing_errors_total").labels(
                tenant_id=tenant_id,
                collection=collection_name,
                error_type=type(e).__name__,
            ).inc()


def _extract_content_from_results(extraction_results: dict[str, Any]) -> str:
|
||||
"""Extract text content from extraction results"""
|
||||
content_parts: list[Any] = []
|
||||
|
||||
# Add extracted fields
|
||||
extracted_fields = extraction_results.get("extracted_fields", {})
|
||||
for field_name, field_value in extracted_fields.items():
|
||||
content_parts.append(f"{field_name}: {field_value}")
|
||||
|
||||
return "\n".join(content_parts)
|
||||
|
||||
|
||||
def _extract_content_from_entity(entity: dict[str, Any]) -> str:
|
||||
"""Extract text content from KG entity"""
|
||||
content_parts: list[Any] = []
|
||||
|
||||
# Add entity type and ID
|
||||
entity_type = entity.get("type", "Unknown")
|
||||
entity_id = entity.get("id", "")
|
||||
content_parts.append(f"Entity Type: {entity_type}")
|
||||
content_parts.append(f"Entity ID: {entity_id}")
|
||||
|
||||
# Add properties
|
||||
properties = entity.get("properties", {})
|
||||
for prop_name, prop_value in properties.items():
|
||||
if prop_name not in ["tenant_id", "asserted_at", "retracted_at"]:
|
||||
content_parts.append(f"{prop_name}: {prop_value}")
|
||||
|
||||
return "\n".join(content_parts)
|
||||
|
||||
|
||||
def _chunk_text(text: str) -> list[str]:
|
||||
"""Chunk text into smaller pieces"""
|
||||
if not text:
|
||||
return []
|
||||
|
||||
# Simple chunking by sentences/paragraphs
|
||||
chunks: list[Any] = []
|
||||
current_chunk = ""
|
||||
|
||||
sentences = text.split(". ")
|
||||
|
||||
for sentence in sentences:
|
||||
if len(current_chunk) + len(sentence) < settings.chunk_size:
|
||||
current_chunk += sentence + ". "
|
||||
else:
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
current_chunk = sentence + ". "
|
||||
|
||||
if current_chunk:
|
||||
chunks.append(current_chunk.strip())
|
||||
|
||||
return chunks
|
||||
|
||||
|
||||
async def _generate_embedding(text: str) -> list[float]:
|
||||
"""Generate embedding for text"""
|
||||
if embedding_model:
|
||||
try:
|
||||
embedding = embedding_model.encode(text)
|
||||
return embedding.tolist()
|
||||
except Exception as e:
|
||||
logger.error("Failed to generate embedding", error=str(e))
|
||||
|
||||
# Fallback: random embedding
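# NOTE: random vectors are a development stub only; similarity over them is
# meaningless, so production deployments should keep the real model loaded
# or fail fast instead of silently indexing noise.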
|
||||
import random
|
||||
|
||||
return [random.random() for _ in range(settings.embedding_dimension)]
|
||||
|
||||
|
||||
def _hash_pii_mapping(pii_mapping: dict[str, str]) -> str:
|
||||
"""Create hash of PII mapping for audit purposes"""
|
||||
import hashlib
|
||||
import json
|
||||
|
||||
mapping_json = json.dumps(pii_mapping, sort_keys=True)
|
||||
return hashlib.sha256(mapping_json.encode()).hexdigest()
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
return JSONResponse(
|
||||
status_code=exc.status_code,
|
||||
content=ErrorResponse(
|
||||
type=f"https://httpstatuses.com/{exc.status_code}",
|
||||
title=exc.detail,
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id="",
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run("main:app", host="0.0.0.0", port=8006, reload=True, log_config=None)
|
||||
19
apps/svc_rag_indexer/requirements.txt
Normal file
@@ -0,0 +1,19 @@
|
||||
# Service-specific dependencies for svc_rag_indexer
|
||||
# NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
|
||||
|
||||
# Text chunking (lightweight alternative to langchain)
|
||||
tiktoken>=0.11.0
|
||||
|
||||
# Text preprocessing (lightweight)
|
||||
beautifulsoup4>=4.14.2
|
||||
|
||||
# Text similarity (CPU-only)
|
||||
faiss-cpu>=1.12.0
|
||||
|
||||
# Document processing (lightweight)
|
||||
python-docx>=1.2.0
|
||||
python-pptx>=1.0.2
|
||||
openpyxl>=3.1.5
|
||||
|
||||
# Sparse vector processing
|
||||
sparse-dot-topn>=1.1.5
|
||||
36
apps/svc_rag_retriever/Dockerfile
Normal file
@@ -0,0 +1,36 @@
|
||||
# Dockerfile for svc_rag_retriever - Uses base-ml image
|
||||
# Base image contains: FastAPI, database drivers, sentence-transformers, PyTorch, etc.
|
||||
# This Dockerfile only adds service-specific dependencies and application code
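# Example build (illustrative, assuming the repo root as build context, which
# the COPY paths below imply):
#   docker build -f apps/svc_rag_retriever/Dockerfile \
#     --build-arg BASE_VERSION=v1.0.1 -t svc-rag-retriever:dev .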
|
||||
|
||||
ARG REGISTRY=gitea.harkon.co.uk
|
||||
ARG OWNER=harkon
|
||||
ARG BASE_VERSION=v1.0.1
|
||||
FROM ${REGISTRY}/${OWNER}/base-ml:${BASE_VERSION}
|
||||
|
||||
# Switch to root to install service-specific dependencies
|
||||
USER root
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy service-specific requirements and install
|
||||
COPY apps/svc_rag_retriever/requirements.txt /tmp/service-requirements.txt
|
||||
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_rag_retriever/ ./apps/svc_rag_retriever/
|
||||
|
||||
# Set permissions and switch to non-root user
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the application
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_rag_retriever.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
476
apps/svc_rag_retriever/main.py
Normal file
@@ -0,0 +1,476 @@
|
||||
# FILE: apps/svc-rag-retriever/main.py
|
||||
# mypy: disable-error-code=union-attr
|
||||
# Hybrid search with KG fusion, reranking, and calibrated confidence
|
||||
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
from fastapi import Depends, HTTPException, Query, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from qdrant_client.models import SparseVector
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.calibration import ConfidenceCalibrator
|
||||
from libs.config import (
|
||||
BaseAppSettings,
|
||||
create_event_bus,
|
||||
create_neo4j_client,
|
||||
create_qdrant_client,
|
||||
)
|
||||
from libs.events import EventBus
|
||||
from libs.neo import Neo4jClient
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.rag import RAGRetriever
|
||||
from libs.schemas import ErrorResponse, RAGSearchRequest, RAGSearchResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class RAGRetrieverSettings(BaseAppSettings):
|
||||
"""Settings for RAG retriever service"""
|
||||
|
||||
service_name: str = "svc-rag-retriever"
|
||||
|
||||
# Embedding configuration
|
||||
embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2"
|
||||
embedding_dimension: int = 384
|
||||
|
||||
# Search configuration
|
||||
default_k: int = 10
|
||||
max_k: int = 100
|
||||
alpha: float = 0.5 # Dense/sparse balance
|
||||
beta: float = 0.3 # Vector/KG balance
|
||||
gamma: float = 0.2 # Reranking weight
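# NOTE (illustrative, not the verified libs.rag implementation): with these
# defaults the fused score is assumed to combine roughly as
#   hybrid = alpha * dense_score + (1 - alpha) * sparse_score
#   fused  = (1 - beta) * hybrid + beta * kg_score
#   final  = (1 - gamma) * fused + gamma * rerank_score
# The exact fusion lives in RAGRetriever; this sketch only documents the
# intent of the alpha/beta/gamma defaults above.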
|
||||
|
||||
# Collections to search
|
||||
search_collections: list[str] = ["documents", "tax_rules", "guidance"]
|
||||
|
||||
# Reranking
|
||||
reranker_model: str | None = None
|
||||
rerank_top_k: int = 50
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-rag-retriever",
|
||||
title="Tax Agent RAG Retriever Service",
|
||||
description="Hybrid search with KG fusion and reranking",
|
||||
settings_class=RAGRetrieverSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
qdrant_client = None
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
rag_retriever: RAGRetriever | None = None
|
||||
event_bus: EventBus | None = None
|
||||
embedding_model = None
|
||||
confidence_calibrator: ConfidenceCalibrator | None = None
|
||||
tracer = get_tracer("svc-rag-retriever")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global qdrant_client, neo4j_client, rag_retriever, event_bus, embedding_model, confidence_calibrator
|
||||
|
||||
logger.info("Starting RAG retriever service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Qdrant client
|
||||
qdrant_client = create_qdrant_client(settings)
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize RAG retriever
|
||||
rag_retriever = RAGRetriever(
|
||||
qdrant_client=qdrant_client,
|
||||
neo4j_client=neo4j_client,
|
||||
reranker_model=settings.reranker_model,
|
||||
)
|
||||
|
||||
# Initialize embedding model
|
||||
try:
|
||||
from sentence_transformers import SentenceTransformer
|
||||
|
||||
embedding_model = SentenceTransformer(settings.embedding_model)
|
||||
logger.info("Embedding model loaded", model=settings.embedding_model)
|
||||
except ImportError:
|
||||
logger.warning("sentence-transformers not available, using mock embeddings")
|
||||
embedding_model = None
|
||||
|
||||
# Initialize confidence calibrator
|
||||
confidence_calibrator = ConfidenceCalibrator(method="isotonic")
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
logger.info("RAG retriever service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
|
||||
logger.info("Shutting down RAG retriever service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("RAG retriever service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"search_collections": settings.search_collections,
|
||||
}
|
||||
|
||||
|
||||
@app.post("/search", response_model=RAGSearchResponse)
|
||||
async def search(
|
||||
request_data: RAGSearchRequest,
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> RAGSearchResponse:
|
||||
"""Perform hybrid RAG search"""
|
||||
|
||||
with tracer.start_as_current_span("rag_search") as span:
|
||||
span.set_attribute("query", request_data.query[:100])
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("k", request_data.k)
|
||||
|
||||
try:
|
||||
# Generate embeddings for query
|
||||
dense_vector = await _generate_embedding(request_data.query)
|
||||
sparse_vector = await _generate_sparse_vector(request_data.query)
|
||||
|
||||
# Perform search
|
||||
search_results = await rag_retriever.search( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
query=request_data.query,
|
||||
collections=settings.search_collections,
|
||||
dense_vector=dense_vector,
|
||||
sparse_vector=sparse_vector,
|
||||
k=request_data.k,
|
||||
alpha=settings.alpha,
|
||||
beta=settings.beta,
|
||||
gamma=settings.gamma,
|
||||
tax_year=request_data.tax_year,
|
||||
jurisdiction=request_data.jurisdiction,
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("searches_total").labels(tenant_id=tenant_id).inc()
|
||||
|
||||
metrics.histogram("search_results_count").labels(
|
||||
tenant_id=tenant_id
|
||||
).observe(len(search_results["chunks"]))
|
||||
|
||||
metrics.histogram("search_confidence").labels(tenant_id=tenant_id).observe(
|
||||
search_results["calibrated_confidence"]
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"RAG search completed",
|
||||
query=request_data.query[:50],
|
||||
results=len(search_results["chunks"]),
|
||||
confidence=search_results["calibrated_confidence"],
|
||||
)
|
||||
|
||||
return RAGSearchResponse(
|
||||
chunks=search_results["chunks"],
|
||||
citations=search_results["citations"],
|
||||
kg_hints=search_results["kg_hints"],
|
||||
calibrated_confidence=search_results["calibrated_confidence"],
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"RAG search failed", query=request_data.query[:50], error=str(e)
|
||||
)
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("search_errors_total").labels(
|
||||
tenant_id=tenant_id, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
raise HTTPException(status_code=500, detail=f"Search failed: {str(e)}")
|
||||
|
||||
|
||||
@app.get("/similar/{doc_id}")
|
||||
async def find_similar_documents(
|
||||
doc_id: str,
|
||||
k: int = Query(default=10, le=settings.max_k),
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Find documents similar to given document"""
|
||||
|
||||
with tracer.start_as_current_span("find_similar") as span:
|
||||
span.set_attribute("doc_id", doc_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("k", k)
|
||||
|
||||
try:
|
||||
# Get document content from vector database
|
||||
# This would search for the document by doc_id in metadata
|
||||
from qdrant_client.models import FieldCondition, Filter, MatchValue
|
||||
|
||||
filter_conditions = Filter(
|
||||
must=[
|
||||
FieldCondition(key="doc_id", match=MatchValue(value=doc_id)),
|
||||
FieldCondition(key="tenant_id", match=MatchValue(value=tenant_id)),
|
||||
]
|
||||
)
|
||||
|
||||
# Search for the document
|
||||
doc_results = await rag_retriever.collection_manager.search_dense( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
collection_name="documents",
|
||||
query_vector=[0.0] * settings.embedding_dimension, # Dummy vector
|
||||
limit=1,
|
||||
filter_conditions=filter_conditions,
|
||||
)
|
||||
|
||||
if not doc_results:
|
||||
raise HTTPException(status_code=404, detail="Document not found")
|
||||
|
||||
# Get the document's vector and use it for similarity search
|
||||
doc_vector = doc_results[0]["payload"].get("vector")
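# NOTE (assumption): Qdrant stores vectors outside the payload, so this only
# works if the indexer copied the embedding into the payload; otherwise the
# point would have to be fetched with with_vectors=True and the vector read
# from the point itself.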
|
||||
if not doc_vector:
|
||||
raise HTTPException(status_code=400, detail="Document has no vector")
|
||||
|
||||
# Find similar documents
|
||||
similar_results = await rag_retriever.collection_manager.search_dense( # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
collection_name="documents",
|
||||
query_vector=doc_vector,
|
||||
limit=k + 1, # +1 to exclude the original document
|
||||
filter_conditions=Filter(
|
||||
must=[
|
||||
FieldCondition(
|
||||
key="tenant_id", match=MatchValue(value=tenant_id)
|
||||
)
|
||||
],
|
||||
must_not=[
|
||||
FieldCondition(key="doc_id", match=MatchValue(value=doc_id))
|
||||
],
|
||||
),
|
||||
)
|
||||
|
||||
return {
|
||||
"doc_id": doc_id,
|
||||
"similar_documents": similar_results[:k],
|
||||
"count": len(similar_results[:k]),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Similar document search failed", doc_id=doc_id, error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500, detail=f"Similar search failed: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@app.post("/explain")
|
||||
async def explain_search(
|
||||
query: str,
|
||||
search_results: list[dict[str, Any]],
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Explain search results and ranking"""
|
||||
|
||||
with tracer.start_as_current_span("explain_search") as span:
|
||||
span.set_attribute("query", query[:100])
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
span.set_attribute("results_count", len(search_results))
|
||||
|
||||
try:
|
||||
explanations = []
|
||||
|
||||
for i, result in enumerate(search_results):
|
||||
explanation = {
|
||||
"rank": i + 1,
|
||||
"chunk_id": result.get("id"),
|
||||
"score": result.get("score", 0.0),
|
||||
"dense_score": result.get("dense_score", 0.0),
|
||||
"sparse_score": result.get("sparse_score", 0.0),
|
||||
"collection": result.get("collection"),
|
||||
"explanation": _generate_explanation(query, result),
|
||||
}
|
||||
explanations.append(explanation)
|
||||
|
||||
return {
|
||||
"query": query,
|
||||
"explanations": explanations,
|
||||
"ranking_factors": {
|
||||
"alpha": settings.alpha,
|
||||
"beta": settings.beta,
|
||||
"gamma": settings.gamma,
|
||||
},
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Search explanation failed", error=str(e))
|
||||
raise HTTPException(status_code=500, detail=f"Explanation failed: {str(e)}")
|
||||
|
||||
|
||||
async def _generate_embedding(text: str) -> list[float]:
|
||||
"""Generate dense embedding for text"""
|
||||
if embedding_model:
|
||||
try:
|
||||
embedding = embedding_model.encode(text)
|
||||
return embedding.tolist()
|
||||
except Exception as e:
|
||||
logger.error("Failed to generate embedding", error=str(e))
|
||||
|
||||
# Fallback: random embedding
|
||||
import random
|
||||
|
||||
return [random.random() for _ in range(settings.embedding_dimension)]
|
||||
|
||||
|
||||
async def _generate_sparse_vector(text: str) -> SparseVector:
|
||||
"""Generate sparse vector for text (BM25-style)"""
|
||||
try:
|
||||
# This would use a proper sparse encoder like SPLADE
|
||||
# For now, create a simple sparse representation
|
||||
from qdrant_client.models import SparseVector
|
||||
|
||||
# Simple word-based sparse vector
|
||||
words = text.lower().split()
|
||||
word_counts: dict[str, int] = {}
|
||||
for word in words:
|
||||
word_counts[word] = word_counts.get(word, 0) + 1
|
||||
|
||||
# Convert to sparse vector format
|
||||
indices = []
|
||||
values = []
|
||||
|
||||
for _i, (word, count) in enumerate(word_counts.items()):
|
||||
# Use hash of word as index
|
||||
word_hash = hash(word) % 10000 # Limit vocabulary size
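# NOTE: Python's built-in hash() is salted per process (PYTHONHASHSEED), so
# these indices only line up with the indexer's sparse vectors if the same
# seed is used everywhere; a deterministic hash (e.g. hashlib) or a shared
# vocabulary would make the indices stable across services.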
|
||||
indices.append(word_hash)
|
||||
values.append(float(count))
|
||||
|
||||
return SparseVector(indices=indices, values=values)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to generate sparse vector", error=str(e))
|
||||
# Return empty sparse vector
|
||||
from qdrant_client.models import SparseVector
|
||||
|
||||
return SparseVector(indices=[], values=[])
|
||||
|
||||
|
||||
def _generate_explanation(query: str, result: dict[str, Any]) -> str:
|
||||
"""Generate human-readable explanation for search result"""
|
||||
|
||||
explanations = []
|
||||
|
||||
# Score explanation
|
||||
score = result.get("score", 0.0)
|
||||
dense_score = result.get("dense_score", 0.0)
|
||||
sparse_score = result.get("sparse_score", 0.0)
|
||||
|
||||
explanations.append(f"Overall score: {score:.3f}")
|
||||
|
||||
if dense_score > 0:
|
||||
explanations.append(f"Semantic similarity: {dense_score:.3f}")
|
||||
|
||||
if sparse_score > 0:
|
||||
explanations.append(f"Keyword match: {sparse_score:.3f}")
|
||||
|
||||
# Collection explanation
|
||||
collection = result.get("collection")
|
||||
if collection:
|
||||
explanations.append(f"Source: {collection}")
|
||||
|
||||
# Metadata explanation
|
||||
payload = result.get("payload", {})
|
||||
doc_id = payload.get("doc_id")
|
||||
if doc_id:
|
||||
explanations.append(f"Document: {doc_id}")
|
||||
|
||||
confidence = payload.get("confidence")
|
||||
if confidence:
|
||||
explanations.append(f"Extraction confidence: {confidence:.3f}")
|
||||
|
||||
return "; ".join(explanations)
|
||||
|
||||
|
||||
@app.get("/stats")
|
||||
async def get_search_stats(
|
||||
current_user: dict[str, Any] = Depends(get_current_user),
|
||||
tenant_id: str = Depends(get_tenant_id),
|
||||
) -> dict[str, Any]:
|
||||
"""Get search statistics"""
|
||||
|
||||
try:
|
||||
# This would aggregate metrics from Prometheus
|
||||
# For now, return mock stats
|
||||
stats = {
|
||||
"total_searches": 1000,
|
||||
"avg_results_per_search": 8.5,
|
||||
"avg_confidence": 0.75,
|
||||
"collections": {
|
||||
"documents": {"searches": 800, "avg_confidence": 0.78},
|
||||
"tax_rules": {"searches": 150, "avg_confidence": 0.85},
|
||||
"guidance": {"searches": 50, "avg_confidence": 0.70},
|
||||
},
|
||||
"top_queries": [
|
||||
{"query": "capital gains tax", "count": 45},
|
||||
{"query": "business expenses", "count": 38},
|
||||
{"query": "property income", "count": 32},
|
||||
],
|
||||
}
|
||||
|
||||
return stats
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to get search stats", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to get stats")
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
return JSONResponse(
|
||||
status_code=exc.status_code,
|
||||
content=ErrorResponse(
|
||||
type=f"https://httpstatuses.com/{exc.status_code}",
|
||||
title=exc.detail,
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id=getattr(request.state, "trace_id", None),
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run("main:app", host="0.0.0.0", port=8007, reload=True, log_config=None)
|
||||
11
apps/svc_rag_retriever/requirements.txt
Normal file
@@ -0,0 +1,11 @@
|
||||
# Service-specific dependencies for svc_rag_retriever
|
||||
# NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image
|
||||
|
||||
# Search and ranking (lightweight)
|
||||
rank-bm25>=0.2.2
|
||||
|
||||
# Vector similarity (CPU-only, lighter than GPU version)
|
||||
faiss-cpu>=1.12.0
|
||||
|
||||
# Sparse retrieval
|
||||
sparse-dot-topn>=1.1.5
|
||||
53
apps/svc_reason/Dockerfile
Normal file
@@ -0,0 +1,53 @@
|
||||
# Multi-stage build for svc_reason
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create virtual environment
|
||||
RUN python -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY apps/svc_reason/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& groupadd -r appuser \
|
||||
&& useradd -r -g appuser appuser
|
||||
|
||||
# Copy virtual environment from builder
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_reason/ ./apps/svc_reason/
|
||||
|
||||
# Create non-root user and set permissions
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the application
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_reason.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
677
apps/svc_reason/main.py
Normal file
@@ -0,0 +1,677 @@
|
||||
"""Tax calculation engine with schedule computation and evidence trails."""
|
||||
|
||||
# mypy: disable-error-code=union-attr
|
||||
|
||||
# FILE: apps/svc-reason/main.py
|
||||
# pylint: disable=wrong-import-position,import-error,too-few-public-methods,global-statement
|
||||
# pylint: disable=global-variable-not-assigned,raise-missing-from,unused-argument
|
||||
# pylint: disable=broad-exception-caught,no-else-return,too-many-arguments,too-many-positional-arguments
|
||||
# pylint: disable=too-many-locals,import-outside-toplevel,too-many-statements
|
||||
|
||||
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_neo4j_client
|
||||
from libs.events import EventBus, EventPayload, EventTopics
|
||||
from libs.neo import Neo4jClient
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse, ScheduleComputeRequest, ScheduleComputeResponse
|
||||
from libs.security import get_current_user, get_tenant_id
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class ReasonSettings(BaseAppSettings):
|
||||
"""Settings for reasoning service"""
|
||||
|
||||
service_name: str = "svc-reason"
|
||||
|
||||
# Tax year configuration
|
||||
current_tax_year: str = "2023-24"
|
||||
supported_tax_years: list[str] = ["2021-22", "2022-23", "2023-24", "2024-25"]
|
||||
|
||||
# Calculation configuration
|
||||
precision: int = 2 # Decimal places
|
||||
rounding_method: str = "ROUND_HALF_UP"
|
||||
|
||||
# Schedule support
|
||||
supported_schedules: list[str] = ["SA100", "SA103", "SA105", "SA106"]
|
||||
|
||||
# Validation
|
||||
max_income: float = 10000000.0 # £10M
|
||||
max_expenses: float = 10000000.0 # £10M
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-reason",
|
||||
title="Tax Agent Reasoning Service",
|
||||
description="Tax calculation engine with schedule computation",
|
||||
settings_class=ReasonSettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
neo4j_client: Neo4jClient | None = None
|
||||
event_bus: EventBus | None = None
|
||||
tracer = get_tracer("svc-reason")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
|
||||
logger.info("Starting reasoning service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Neo4j client
|
||||
neo4j_driver = create_neo4j_client(settings)
|
||||
neo4j_client = Neo4jClient(neo4j_driver)
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
# Subscribe to KG upsert events
|
||||
await event_bus.subscribe(EventTopics.KG_UPSERTED, _handle_kg_upserted) # type: ignore
|
||||
|
||||
logger.info("Reasoning service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global neo4j_client, event_bus
|
||||
|
||||
logger.info("Shutting down reasoning service")
|
||||
|
||||
if neo4j_client:
|
||||
await neo4j_client.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("Reasoning service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"supported_schedules": settings.supported_schedules,
|
||||
}
|
||||
|
||||
|
||||
@app.post("/compute", response_model=ScheduleComputeResponse)
|
||||
async def compute_schedule(
|
||||
request_data: ScheduleComputeRequest,
|
||||
background_tasks: BackgroundTasks,
|
||||
current_user: dict[str, Any] = Depends(get_current_user()),
|
||||
tenant_id: str = Depends(get_tenant_id()),
|
||||
) -> ScheduleComputeResponse:
|
||||
"""Compute tax schedule"""
|
||||
|
||||
with tracer.start_as_current_span("compute_schedule") as span:
|
||||
span.set_attribute("tax_year", request_data.tax_year)
|
||||
span.set_attribute("taxpayer_id", request_data.taxpayer_id)
|
||||
span.set_attribute("schedule_id", request_data.schedule_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Validate inputs
|
||||
if request_data.tax_year not in settings.supported_tax_years:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported tax year: {request_data.tax_year}",
|
||||
)
|
||||
|
||||
if request_data.schedule_id not in settings.supported_schedules:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"Unsupported schedule: {request_data.schedule_id}",
|
||||
)
|
||||
|
||||
# Generate calculation ID
|
||||
calculation_id = str(ulid.new())
|
||||
span.set_attribute("calculation_id", calculation_id)
|
||||
|
||||
# Start background computation
|
||||
background_tasks.add_task(
|
||||
_compute_schedule_async,
|
||||
request_data.tax_year,
|
||||
request_data.taxpayer_id,
|
||||
request_data.schedule_id,
|
||||
tenant_id,
|
||||
calculation_id,
|
||||
current_user.get("sub", "system"),
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Schedule computation started",
|
||||
calculation_id=calculation_id,
|
||||
schedule=request_data.schedule_id,
|
||||
)
|
||||
|
||||
return ScheduleComputeResponse(
|
||||
calculation_id=calculation_id,
|
||||
schedule=request_data.schedule_id,
|
||||
form_boxes={}, # Will be populated when computation completes
|
||||
evidence_trail=[],
|
||||
)
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error("Failed to start computation", error=str(e))
|
||||
raise HTTPException(status_code=500, detail="Failed to start computation")
|
||||
|
||||
|
||||
@app.get("/calculations/{calculation_id}")
|
||||
async def get_calculation_results(
|
||||
calculation_id: str,
|
||||
current_user: dict[str, Any] = Depends(get_current_user()),
|
||||
tenant_id: str = Depends(get_tenant_id()),
|
||||
) -> dict[str, Any]:
|
||||
"""Get calculation results"""
|
||||
|
||||
with tracer.start_as_current_span("get_calculation_results") as span:
|
||||
span.set_attribute("calculation_id", calculation_id)
|
||||
span.set_attribute("tenant_id", tenant_id)
|
||||
|
||||
try:
|
||||
# Query calculation from Neo4j
|
||||
query = """
|
||||
MATCH (c:Calculation {calculation_id: $calculation_id, tenant_id: $tenant_id})
|
||||
WHERE c.retracted_at IS NULL
|
||||
RETURN c
|
||||
"""
|
||||
|
||||
results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
|
||||
query, {"calculation_id": calculation_id, "tenant_id": tenant_id}
|
||||
)
|
||||
|
||||
if not results:
|
||||
raise HTTPException(status_code=404, detail="Calculation not found")
|
||||
|
||||
calculation = results[0]["c"]
|
||||
|
||||
# Get form boxes
|
||||
form_boxes_query = """
|
||||
MATCH (c:Calculation {calculation_id: $calculation_id})-[:HAS_BOX]->(b:FormBox)
|
||||
WHERE c.retracted_at IS NULL AND b.retracted_at IS NULL
|
||||
RETURN b
|
||||
"""
|
||||
|
||||
box_results = await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
|
||||
form_boxes_query, {"calculation_id": calculation_id}
|
||||
)
|
||||
|
||||
form_boxes = {}
|
||||
for box_result in box_results:
|
||||
box = box_result["b"]
|
||||
form_boxes[box["box"]] = {
|
||||
"value": box["value"],
|
||||
"description": box.get("description"),
|
||||
"confidence": box.get("confidence"),
|
||||
}
|
||||
|
||||
return {
|
||||
"calculation_id": calculation_id,
|
||||
"schedule": calculation.get("schedule"),
|
||||
"tax_year": calculation.get("tax_year"),
|
||||
"status": calculation.get("status", "completed"),
|
||||
"form_boxes": form_boxes,
|
||||
"calculated_at": calculation.get("calculated_at"),
|
||||
}
|
||||
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to get calculation results",
|
||||
calculation_id=calculation_id,
|
||||
error=str(e),
|
||||
)
|
||||
raise HTTPException(
|
||||
status_code=500, detail="Failed to get calculation results"
|
||||
)
|
||||
|
||||
|
||||
async def _handle_kg_upserted(topic: str, payload: EventPayload) -> None:
|
||||
"""Handle KG upsert events for auto-calculation"""
|
||||
try:
|
||||
data = payload.data
|
||||
entities = data.get("entities", [])
|
||||
tenant_id = data.get("tenant_id")
|
||||
|
||||
# Check if we have enough data for calculation
|
||||
has_income = any(e.get("type") == "IncomeItem" for e in entities)
|
||||
has_expenses = any(e.get("type") == "ExpenseItem" for e in entities)
|
||||
|
||||
if has_income or has_expenses:
|
||||
logger.info(
|
||||
"Auto-triggering calculation due to new financial data",
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
# Find taxpayer ID from entities
|
||||
taxpayer_id = None
|
||||
for entity in entities:
|
||||
if entity.get("type") == "TaxpayerProfile":
|
||||
taxpayer_id = entity.get("id")
|
||||
break
|
||||
|
||||
if taxpayer_id:
|
||||
await _compute_schedule_async(
|
||||
tax_year=settings.current_tax_year,
|
||||
taxpayer_id=taxpayer_id,
|
||||
schedule_id="SA103", # Default to self-employment
|
||||
tenant_id=tenant_id or "",
|
||||
calculation_id=str(ulid.new()),
|
||||
actor=payload.actor,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Failed to handle KG upsert for auto-calculation", error=str(e))
|
||||
|
||||
|
||||
async def _compute_schedule_async(
|
||||
tax_year: str,
|
||||
taxpayer_id: str,
|
||||
schedule_id: str,
|
||||
tenant_id: str,
|
||||
calculation_id: str,
|
||||
actor: str,
|
||||
) -> None:
|
||||
"""Compute schedule asynchronously"""
|
||||
|
||||
with tracer.start_as_current_span("compute_schedule_async") as span:
|
||||
span.set_attribute("calculation_id", calculation_id)
|
||||
span.set_attribute("schedule_id", schedule_id)
|
||||
span.set_attribute("tax_year", tax_year)
|
||||
|
||||
try:
|
||||
# Get relevant data from knowledge graph
|
||||
financial_data = await _get_financial_data(taxpayer_id, tax_year, tenant_id)
|
||||
|
||||
# Perform calculations based on schedule
|
||||
if schedule_id == "SA103":
|
||||
form_boxes, evidence_trail = await _compute_sa103(
|
||||
financial_data, tax_year
|
||||
)
|
||||
elif schedule_id == "SA105":
|
||||
form_boxes, evidence_trail = await _compute_sa105(
|
||||
financial_data, tax_year
|
||||
)
|
||||
elif schedule_id == "SA100":
|
||||
form_boxes, evidence_trail = await _compute_sa100(
|
||||
financial_data, tax_year
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Unsupported schedule: {schedule_id}")
|
||||
|
||||
# Store calculation in knowledge graph
|
||||
await _store_calculation(
|
||||
calculation_id,
|
||||
schedule_id,
|
||||
tax_year,
|
||||
taxpayer_id,
|
||||
form_boxes,
|
||||
evidence_trail,
|
||||
tenant_id,
|
||||
)
|
||||
|
||||
# Update metrics
|
||||
metrics.counter("calculations_completed_total").labels(
|
||||
tenant_id=tenant_id, schedule=schedule_id, tax_year=tax_year
|
||||
).inc()
|
||||
|
||||
# Publish completion event
|
||||
event_payload = EventPayload(
|
||||
data={
|
||||
"calculation_id": calculation_id,
|
||||
"schedule": schedule_id,
|
||||
"tax_year": tax_year,
|
||||
"taxpayer_id": taxpayer_id,
|
||||
"tenant_id": tenant_id,
|
||||
"form_boxes": form_boxes,
|
||||
"box_count": len(form_boxes),
|
||||
},
|
||||
actor=actor,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
|
||||
await event_bus.publish(EventTopics.CALC_SCHEDULE_READY, event_payload) # type: ignore
|
||||
|
||||
logger.info(
|
||||
"Schedule computation completed",
|
||||
calculation_id=calculation_id,
|
||||
schedule=schedule_id,
|
||||
boxes=len(form_boxes),
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Schedule computation failed",
|
||||
calculation_id=calculation_id,
|
||||
error=str(e),
|
||||
)
|
||||
|
||||
# Update error metrics
|
||||
metrics.counter("calculation_errors_total").labels(
|
||||
tenant_id=tenant_id, schedule=schedule_id, error_type=type(e).__name__
|
||||
).inc()
|
||||
|
||||
|
||||
async def _get_financial_data(
|
||||
taxpayer_id: str, tax_year: str, tenant_id: str
|
||||
) -> dict[str, Any]:
|
||||
"""Get financial data from knowledge graph"""
|
||||
|
||||
# Get income items
|
||||
income_query = """
|
||||
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_INCOME]->(i:IncomeItem)
|
||||
WHERE i.retracted_at IS NULL
|
||||
AND i.tax_year = $tax_year
|
||||
RETURN i
|
||||
"""
|
||||
|
||||
income_results = (
|
||||
await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
|
||||
income_query,
|
||||
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
|
||||
)
|
||||
)
|
||||
|
||||
# Get expense items
|
||||
expense_query = """
|
||||
MATCH (t:TaxpayerProfile {taxpayer_id: $taxpayer_id, tenant_id: $tenant_id})-[:HAS_EXPENSE]->(e:ExpenseItem)
|
||||
WHERE e.retracted_at IS NULL
|
||||
AND e.tax_year = $tax_year
|
||||
RETURN e
|
||||
"""
|
||||
|
||||
expense_results = (
|
||||
await neo4j_client.run_query( # pyright: ignore[reportOptionalMemberAccess]
|
||||
expense_query,
|
||||
{"taxpayer_id": taxpayer_id, "tax_year": tax_year, "tenant_id": tenant_id},
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
"income_items": [result["i"] for result in income_results],
|
||||
"expense_items": [result["e"] for result in expense_results],
|
||||
"tax_year": tax_year,
|
||||
"taxpayer_id": taxpayer_id,
|
||||
}
|
||||
|
||||
|
||||
async def _compute_sa103(
|
||||
financial_data: dict[str, Any], tax_year: str
|
||||
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
|
||||
"""Compute SA103 (Self-employment) schedule"""
|
||||
|
||||
income_items = financial_data.get("income_items", [])
|
||||
expense_items = financial_data.get("expense_items", [])
|
||||
|
||||
# Calculate totals
|
||||
total_turnover = Decimal("0")
|
||||
total_expenses = Decimal("0")
|
||||
|
||||
evidence_trail = []
|
||||
|
||||
# Sum income
|
||||
for income in income_items:
|
||||
if income.get("type") == "self_employment":
|
||||
amount = Decimal(str(income.get("gross", 0)))
|
||||
total_turnover += amount
|
||||
|
||||
evidence_trail.append(
|
||||
{
|
||||
"box": "20",
|
||||
"source_entity": income.get("income_id"),
|
||||
"amount": float(amount),
|
||||
"description": f"Income: {income.get('description', 'Unknown')}",
|
||||
}
|
||||
)
|
||||
|
||||
# Sum expenses
|
||||
for expense in expense_items:
|
||||
if expense.get("allowable", True):
|
||||
amount = Decimal(str(expense.get("amount", 0)))
|
||||
total_expenses += amount
|
||||
|
||||
evidence_trail.append(
|
||||
{
|
||||
"box": "31",
|
||||
"source_entity": expense.get("expense_id"),
|
||||
"amount": float(amount),
|
||||
"description": f"Expense: {expense.get('description', 'Unknown')}",
|
||||
}
|
||||
)
|
||||
|
||||
# Calculate net profit
|
||||
net_profit = total_turnover - total_expenses
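# Worked example (illustrative): turnover of 50,000 and allowable expenses of
# 12,000 give box 20 = 50000.0, box 31 = 12000.0 and box 32 (net profit)
# = 38000.0 below.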
|
||||
|
||||
# Create form boxes
|
||||
form_boxes = {
|
||||
"20": {
|
||||
"value": float(total_turnover),
|
||||
"description": "Total turnover",
|
||||
"confidence": 0.9,
|
||||
},
|
||||
"31": {
|
||||
"value": float(total_expenses),
|
||||
"description": "Total allowable business expenses",
|
||||
"confidence": 0.9,
|
||||
},
|
||||
"32": {
|
||||
"value": float(net_profit),
|
||||
"description": "Net profit",
|
||||
"confidence": 0.9,
|
||||
},
|
||||
}
|
||||
|
||||
return form_boxes, evidence_trail
|
||||
|
||||
|
||||
async def _compute_sa105(
|
||||
financial_data: dict[str, Any], tax_year: str
|
||||
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
|
||||
"""Compute SA105 (Property income) schedule"""
|
||||
|
||||
income_items = financial_data.get("income_items", [])
|
||||
expense_items = financial_data.get("expense_items", [])
|
||||
|
||||
# Calculate property income and expenses
|
||||
total_rents = Decimal("0")
|
||||
total_property_expenses = Decimal("0")
|
||||
|
||||
evidence_trail = []
|
||||
|
||||
# Sum property income
|
||||
for income in income_items:
|
||||
if income.get("type") == "property":
|
||||
amount = Decimal(str(income.get("gross", 0)))
|
||||
total_rents += amount
|
||||
|
||||
evidence_trail.append(
|
||||
{
|
||||
"box": "20",
|
||||
"source_entity": income.get("income_id"),
|
||||
"amount": float(amount),
|
||||
"description": f"Property income: {income.get('description', 'Unknown')}",
|
||||
}
|
||||
)
|
||||
|
||||
# Sum property expenses
|
||||
for expense in expense_items:
|
||||
if expense.get("type") == "property" and expense.get("allowable", True):
|
||||
amount = Decimal(str(expense.get("amount", 0)))
|
||||
total_property_expenses += amount
|
||||
|
||||
# Map to appropriate SA105 box based on expense category
|
||||
box = _map_property_expense_to_box(expense.get("category", "other"))
|
||||
|
||||
evidence_trail.append(
|
||||
{
|
||||
"box": box,
|
||||
"source_entity": expense.get("expense_id"),
|
||||
"amount": float(amount),
|
||||
"description": f"Property expense: {expense.get('description', 'Unknown')}",
|
||||
}
|
||||
)
|
||||
|
||||
# Calculate net property income
|
||||
net_property_income = total_rents - total_property_expenses
|
||||
|
||||
form_boxes = {
|
||||
"20": {
|
||||
"value": float(total_rents),
|
||||
"description": "Total rents and other income",
|
||||
"confidence": 0.9,
|
||||
},
|
||||
"38": {
|
||||
"value": float(total_property_expenses),
|
||||
"description": "Total property expenses",
|
||||
"confidence": 0.9,
|
||||
},
|
||||
"net_income": {
|
||||
"value": float(net_property_income),
|
||||
"description": "Net property income",
|
||||
"confidence": 0.9,
|
||||
},
|
||||
}
|
||||
|
||||
return form_boxes, evidence_trail
|
||||
|
||||
|
||||
async def _compute_sa100(
|
||||
financial_data: dict[str, Any], tax_year: str
|
||||
) -> tuple[dict[str, Any], list[dict[str, Any]]]:
|
||||
"""Compute SA100 (Main return) schedule"""
|
||||
|
||||
# This would aggregate from other schedules
|
||||
# For now, return basic structure
|
||||
form_boxes = {
|
||||
"1": {"value": "John Doe", "description": "Your name", "confidence": 0.9}
|
||||
}
|
||||
|
||||
evidence_trail: list[dict[str, Any]] = []
|
||||
|
||||
return form_boxes, evidence_trail
|
||||
|
||||
|
||||
def _map_property_expense_to_box(category: str) -> str:
|
||||
"""Map property expense category to SA105 box"""
|
||||
mapping = {
|
||||
"rent_rates_insurance": "31",
|
||||
"property_management": "32",
|
||||
"services_wages": "33",
|
||||
"repairs_maintenance": "34",
|
||||
"finance_costs": "35",
|
||||
"professional_fees": "36",
|
||||
"costs_of_services": "37",
|
||||
"other": "38",
|
||||
}
|
||||
|
||||
return mapping.get(category, "38")
|
||||
|
||||
|
||||
async def _store_calculation(
|
||||
calculation_id: str,
|
||||
schedule: str,
|
||||
tax_year: str,
|
||||
taxpayer_id: str,
|
||||
form_boxes: dict[str, Any],
|
||||
evidence_trail: list[dict[str, Any]],
|
||||
tenant_id: str,
|
||||
) -> None:
|
||||
"""Store calculation results in knowledge graph"""
|
||||
|
||||
# Create calculation node
|
||||
calc_properties = {
|
||||
"calculation_id": calculation_id,
|
||||
"schedule": schedule,
|
||||
"tax_year": tax_year,
|
||||
"taxpayer_id": taxpayer_id,
|
||||
"tenant_id": tenant_id,
|
||||
"calculated_at": datetime.utcnow().isoformat(),
|
||||
"status": "completed",
|
||||
"source": "reasoning_engine",
|
||||
"extractor_version": "1.0.0",
|
||||
"valid_from": datetime.utcnow(),
|
||||
"asserted_at": datetime.utcnow(),
|
||||
}
|
||||
|
||||
await neo4j_client.create_node("Calculation", calc_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
# Create form box nodes
|
||||
for box_id, box_data in form_boxes.items():
|
||||
box_properties = {
|
||||
"form": schedule,
|
||||
"box": box_id,
|
||||
"value": box_data["value"],
|
||||
"description": box_data.get("description"),
|
||||
"confidence": box_data.get("confidence"),
|
||||
"calculation_id": calculation_id,
|
||||
"tenant_id": tenant_id,
|
||||
"source": "reasoning_engine",
|
||||
"extractor_version": "1.0.0",
|
||||
"valid_from": datetime.utcnow(),
|
||||
"asserted_at": datetime.utcnow(),
|
||||
}
|
||||
|
||||
await neo4j_client.create_node("FormBox", box_properties) # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
# Create relationship
|
||||
await neo4j_client.create_relationship( # pyright: ignore[reportOptionalMemberAccess]
|
||||
"Calculation",
|
||||
calculation_id,
|
||||
"FormBox",
|
||||
f"{calculation_id}_{box_id}",
|
||||
"HAS_BOX",
|
||||
)
|
||||
|
||||
|
||||
@app.exception_handler(HTTPException)
|
||||
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with RFC7807 format"""
|
||||
return JSONResponse(
|
||||
status_code=exc.status_code,
|
||||
content=ErrorResponse(
|
||||
type=f"https://httpstatuses.com/{exc.status_code}",
|
||||
title=exc.detail,
|
||||
status=exc.status_code,
|
||||
detail=exc.detail,
|
||||
instance=str(request.url),
|
||||
trace_id=getattr(request.state, "trace_id", None),
|
||||
).model_dump(),
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
|
||||
uvicorn.run("main:app", host="0.0.0.0", port=8008, reload=True, log_config=None)
|
||||
35
apps/svc_reason/requirements.txt
Normal file
@@ -0,0 +1,35 @@
|
||||
# FastAPI and server
|
||||
fastapi>=0.104.1
|
||||
uvicorn[standard]>=0.24.0
|
||||
pydantic>=2.5.0
|
||||
|
||||
# Service-specific dependencies
|
||||
# Mathematical calculations
|
||||
# decimal is part of Python standard library
|
||||
sympy>=1.12.0
|
||||
|
||||
# Tax calculations
|
||||
numpy>=2.3.3
|
||||
pandas>=2.1.0
|
||||
|
||||
# Date and time calculations
|
||||
python-dateutil>=2.8.0
|
||||
pytz>=2023.3
|
||||
|
||||
# UK tax specific
|
||||
# uk-tax-calculator>=1.0.0 # Package may not exist, commenting out
|
||||
|
||||
# Business rules engine
|
||||
# python-rules>=1.3.0 # Package may not exist, commenting out
|
||||
|
||||
# Financial calculations
|
||||
# quantlib>=1.32.0 # Package may not exist, commenting out
|
||||
|
||||
# Data validation
|
||||
cerberus>=1.3.4
|
||||
|
||||
# Template processing for explanations
|
||||
jinja2>=3.1.0
|
||||
|
||||
# Statistical calculations
|
||||
scipy>=1.11.0
|
||||
53
apps/svc_rpa/Dockerfile
Normal file
@@ -0,0 +1,53 @@
|
||||
# Multi-stage build for svc_rpa
|
||||
FROM python:3.12-slim AS builder
|
||||
|
||||
# Install build dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
build-essential \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create virtual environment
|
||||
RUN python -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Copy requirements and install dependencies
|
||||
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
|
||||
COPY apps/svc_rpa/requirements.txt /tmp/requirements.txt
|
||||
RUN pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/requirements.txt
|
||||
|
||||
# Production stage
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Install runtime dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
curl \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& groupadd -r appuser \
|
||||
&& useradd -r -g appuser appuser
|
||||
|
||||
# Copy virtual environment from builder
|
||||
COPY --from=builder /opt/venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# Set working directory
|
||||
WORKDIR /app
|
||||
|
||||
# Copy application code
|
||||
COPY libs/ ./libs/
|
||||
COPY apps/svc_rpa/ ./apps/svc_rpa/
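# NOTE (assumption): the Playwright pip package does not ship browser
# binaries; unless they are baked into a base layer, a step such as
#   RUN playwright install --with-deps chromium
# is needed for svc_rpa to launch a browser at runtime.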
|
||||
|
||||
# Create non-root user and set permissions
|
||||
RUN chown -R appuser:appuser /app
|
||||
USER appuser
|
||||
|
||||
# Health check
|
||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||
CMD curl -f http://localhost:8000/healthz || exit 1
|
||||
|
||||
# Expose port
|
||||
EXPOSE 8000
|
||||
|
||||
# Run the application
|
||||
CMD ["python", "-m", "uvicorn", "apps.svc_rpa.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
524
apps/svc_rpa/main.py
Normal file
524
apps/svc_rpa/main.py
Normal file
@@ -0,0 +1,524 @@
|
||||
# FILE: apps/svc-rpa/main.py
|
||||
# mypy: disable-error-code=union-attr
|
||||
# Playwright automation for portal data extraction (HMRC, banks, etc.)
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
# Import shared libraries
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
import ulid
|
||||
from fastapi import BackgroundTasks, Depends, HTTPException, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from playwright.async_api import Browser, Page, async_playwright
|
||||
|
||||
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))
|
||||
|
||||
from libs.app_factory import create_app
|
||||
from libs.config import BaseAppSettings, create_event_bus, create_vault_client
|
||||
from libs.events import EventBus, EventPayload
|
||||
from libs.observability import get_metrics, get_tracer, setup_observability
|
||||
from libs.schemas import ErrorResponse
|
||||
from libs.security import VaultTransitHelper, get_current_user, get_tenant_id
|
||||
|
||||
logger = structlog.get_logger()
|
||||
|
||||
|
||||
class RPASettings(BaseAppSettings):
|
||||
"""Settings for RPA service"""
|
||||
|
||||
service_name: str = "svc-rpa"
|
||||
|
||||
# Browser configuration
|
||||
browser_type: str = "chromium" # chromium, firefox, webkit
|
||||
headless: bool = True
|
||||
timeout: int = 30000 # 30 seconds
|
||||
|
||||
# Portal configurations
|
||||
hmrc_base_url: str = "https://www.gov.uk/log-in-hmrc-online-services"
|
||||
open_banking_enabled: bool = False
|
||||
|
||||
# Security
|
||||
max_concurrent_sessions: int = 5
|
||||
session_timeout: int = 300 # 5 minutes
|
||||
|
||||
|
||||
# Create app and settings
|
||||
app, settings = create_app(
|
||||
service_name="svc-rpa",
|
||||
title="Tax Agent RPA Service",
|
||||
description="Robotic Process Automation for portal data extraction",
|
||||
settings_class=RPASettings,
|
||||
)
|
||||
|
||||
# Global clients
|
||||
vault_helper: VaultTransitHelper | None = None
|
||||
event_bus: EventBus | None = None
|
||||
browser: Browser | None = None
|
||||
active_sessions: dict[str, dict[str, Any]] = {}
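# NOTE: sessions live in process memory, so they are lost on restart and are
# not shared across replicas; a single instance (or sticky routing) is
# assumed here.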
|
||||
tracer = get_tracer("svc-rpa")
|
||||
metrics = get_metrics()
|
||||
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event() -> None:
|
||||
"""Initialize service dependencies"""
|
||||
global vault_helper, event_bus, browser
|
||||
|
||||
logger.info("Starting RPA service")
|
||||
|
||||
# Setup observability
|
||||
setup_observability(settings)
|
||||
|
||||
# Initialize Vault helper
|
||||
vault_client = create_vault_client(settings)
|
||||
vault_helper = VaultTransitHelper(vault_client, "tax-agent-transit")
|
||||
|
||||
# Initialize event bus
|
||||
event_bus = create_event_bus(settings)
|
||||
await event_bus.start() # fmt: skip # pyright: ignore[reportOptionalMemberAccess]
|
||||
|
||||
# Initialize browser
|
||||
playwright = await async_playwright().start()
|
||||
browser = await playwright[settings.browser_type].launch(
|
||||
headless=settings.headless,
|
||||
args=["--no-sandbox", "--disable-dev-shm-usage"] if settings.headless else [],
|
||||
)
|
||||
|
||||
logger.info("RPA service started successfully")
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
async def shutdown_event() -> None:
|
||||
"""Cleanup service dependencies"""
|
||||
global event_bus, browser
|
||||
|
||||
logger.info("Shutting down RPA service")
|
||||
|
||||
if browser:
|
||||
await browser.close()
|
||||
|
||||
if event_bus:
|
||||
await event_bus.stop()
|
||||
|
||||
logger.info("RPA service shutdown complete")
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check() -> dict[str, Any]:
|
||||
"""Health check endpoint"""
|
||||
return {
|
||||
"status": "healthy",
|
||||
"service": settings.service_name,
|
||||
"version": settings.service_version,
|
||||
"timestamp": datetime.utcnow().isoformat(),
|
||||
"active_sessions": len(active_sessions),
|
||||
}
|
||||
|
||||
|
||||
@app.post("/sessions")
async def create_session(
    portal: str,
    background_tasks: BackgroundTasks,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Create new RPA session"""

    with tracer.start_as_current_span("create_session") as span:
        span.set_attribute("portal", portal)
        span.set_attribute("tenant_id", tenant_id)

        try:
            # Check session limits
            if len(active_sessions) >= settings.max_concurrent_sessions:
                raise HTTPException(status_code=429, detail="Too many active sessions")

            # Generate session ID
            session_id = str(ulid.new())
            span.set_attribute("session_id", session_id)

            # Create browser context
            context = await browser.new_context(  # pyright: ignore[reportOptionalMemberAccess]
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            )

            page = await context.new_page()

            # Store session
            active_sessions[session_id] = {
                "context": context,
                "page": page,
                "portal": portal,
                "tenant_id": tenant_id,
                "user_id": current_user.get("sub"),
                "created_at": datetime.utcnow(),
                "last_activity": datetime.utcnow(),
            }

            # Schedule session cleanup
            background_tasks.add_task(
                _cleanup_session_after_timeout, session_id, settings.session_timeout
            )

            logger.info("RPA session created", session_id=session_id, portal=portal)

            return {
                "session_id": session_id,
                "portal": portal,
                "status": "created",
                "expires_at": (
                    datetime.utcnow().timestamp() + settings.session_timeout
                ),
            }

        except HTTPException:
            # Preserve deliberate HTTP errors (e.g. the 429 above) instead of masking them as 500
            raise
        except Exception as e:
            logger.error("Failed to create session", error=str(e))
            raise HTTPException(status_code=500, detail="Failed to create session")

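
# --- Illustrative client sketch (separate script, not part of this module) ---
# Shows how a caller opens a session. `portal` is a query parameter on this
# endpoint. The base URL matches the uvicorn defaults at the bottom of this
# file; the Authorization/X-Tenant-ID headers are assumptions, since the exact
# scheme is defined by get_current_user/get_tenant_id in libs.security.
import httpx

BASE_URL = "http://localhost:8001"
HEADERS = {"Authorization": "Bearer <token>", "X-Tenant-ID": "tenant-a"}

resp = httpx.post(f"{BASE_URL}/sessions", params={"portal": "hmrc"}, headers=HEADERS)
resp.raise_for_status()
session_id = resp.json()["session_id"]
print("session:", session_id, "expires at:", resp.json()["expires_at"])
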
@app.post("/sessions/{session_id}/navigate")
async def navigate_to_url(
    session_id: str,
    url: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Navigate to URL in session"""

    with tracer.start_as_current_span("navigate") as span:
        span.set_attribute("session_id", session_id)
        span.set_attribute("url", url)

        try:
            session = _get_session(session_id, tenant_id)
            page = session["page"]

            # Navigate to URL (goto can return None, e.g. for same-document navigations)
            response = await page.goto(url, timeout=settings.timeout)
            response_status = response.status if response else None

            # Update last activity
            session["last_activity"] = datetime.utcnow()

            # Take screenshot for debugging
            await page.screenshot()

            logger.info(
                "Navigated to URL",
                session_id=session_id,
                url=url,
                status=response_status,
            )

            return {
                "status": "success",
                "url": page.url,
                "title": await page.title(),
                "response_status": response_status,
            }

        except HTTPException:
            raise
        except Exception as e:
            logger.error(
                "Navigation failed", session_id=session_id, url=url, error=str(e)
            )
            raise HTTPException(status_code=500, detail=f"Navigation failed: {str(e)}")

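
# --- Illustrative client sketch, continued ------------------------------------
# `url` is also a plain query parameter (not part of a JSON body). BASE_URL,
# HEADERS and session_id come from the sketch after create_session above.
resp = httpx.post(
    f"{BASE_URL}/sessions/{session_id}/navigate",
    params={"url": "https://www.gov.uk/log-in-hmrc-online-services"},
    headers=HEADERS,
)
print(resp.json()["title"], resp.json()["response_status"])
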
@app.post("/sessions/{session_id}/login")
async def login_to_portal(
    session_id: str,
    credentials: dict[str, str],
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Login to portal using encrypted credentials"""

    with tracer.start_as_current_span("login") as span:
        span.set_attribute("session_id", session_id)

        try:
            session = _get_session(session_id, tenant_id)
            page = session["page"]
            portal = session["portal"]

            # Decrypt credentials
            decrypted_credentials: dict[str, Any] = {}
            for key, encrypted_value in credentials.items():
                decrypted_credentials[key] = (
                    vault_helper.decrypt_field(  # pyright: ignore[reportOptionalMemberAccess]
                        key_name=key, ciphertext=encrypted_value
                    )
                )

            # Perform login based on portal type
            if portal == "hmrc":
                success = await _login_hmrc(page, decrypted_credentials)
            elif portal == "open_banking":
                success = await _login_open_banking(page, decrypted_credentials)
            else:
                raise ValueError(f"Unsupported portal: {portal}")

            # Update session
            session["last_activity"] = datetime.utcnow()
            session["authenticated"] = success

            if success:
                logger.info("Login successful", session_id=session_id, portal=portal)
                return {"status": "success", "authenticated": True}
            else:
                logger.warning("Login failed", session_id=session_id, portal=portal)
                return {"status": "failed", "authenticated": False}

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Login error", session_id=session_id, error=str(e))
            raise HTTPException(status_code=500, detail=f"Login failed: {str(e)}")

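
# --- Illustrative client sketch, continued ------------------------------------
# The request body is a plain map of field name -> Vault Transit ciphertext;
# each value must decrypt via VaultTransitHelper.decrypt_field(key_name=<field>).
# This sketch encrypts with hvac directly. The Vault address/token, the
# per-field key names and the "tax-agent-transit" mount point are assumptions
# and must match how the transit keys are actually provisioned. BASE_URL,
# HEADERS and session_id come from the earlier sketch.
import base64

import hvac

vault = hvac.Client(url="http://127.0.0.1:8200", token="<vault-token>")


def transit_encrypt(key_name: str, plaintext: str) -> str:
    # Vault Transit expects base64-encoded plaintext
    result = vault.secrets.transit.encrypt_data(
        name=key_name,
        plaintext=base64.b64encode(plaintext.encode()).decode(),
        mount_point="tax-agent-transit",
    )
    return result["data"]["ciphertext"]


credentials = {
    "user_id": transit_encrypt("user_id", "<government-gateway-user-id>"),
    "password": transit_encrypt("password", "<government-gateway-password>"),
}

resp = httpx.post(
    f"{BASE_URL}/sessions/{session_id}/login",
    json=credentials,  # dict body -> the `credentials` parameter
    headers=HEADERS,
)
print(resp.json())
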
@app.post("/sessions/{session_id}/extract")
async def extract_data(
    session_id: str,
    extraction_config: dict[str, Any],
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, Any]:
    """Extract data from portal"""

    with tracer.start_as_current_span("extract_data") as span:
        span.set_attribute("session_id", session_id)

        try:
            session = _get_session(session_id, tenant_id)
            page = session["page"]
            portal = session["portal"]

            # Check authentication
            if not session.get("authenticated", False):
                raise HTTPException(status_code=401, detail="Session not authenticated")

            # Extract data based on portal and config
            if portal == "hmrc":
                extracted_data = await _extract_hmrc_data(page, extraction_config)
            elif portal == "open_banking":
                extracted_data = await _extract_banking_data(page, extraction_config)
            else:
                raise ValueError(f"Unsupported portal: {portal}")

            # Update session
            session["last_activity"] = datetime.utcnow()

            # Publish extraction event
            event_payload = EventPayload(
                data={
                    "session_id": session_id,
                    "portal": portal,
                    "extraction_config": extraction_config,
                    "extracted_data": extracted_data,
                    "tenant_id": tenant_id,
                },
                actor=current_user.get("sub", "system"),
                tenant_id=tenant_id,
                trace_id=span.get_span_context().trace_id,
            )

            await event_bus.publish("rpa.data_extracted", event_payload)  # fmt: skip # pyright: ignore[reportOptionalMemberAccess]

            logger.info(
                "Data extracted",
                session_id=session_id,
                portal=portal,
                records_count=len(extracted_data.get("records", [])),
            )

            return {
                "status": "success",
                "extracted_data": extracted_data,
                "records_count": len(extracted_data.get("records", [])),
            }

        except HTTPException:
            # Preserve the 401/404/403/408 responses raised above
            raise
        except Exception as e:
            logger.error("Data extraction failed", session_id=session_id, error=str(e))
            raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}")

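
# --- Illustrative client sketch, continued ------------------------------------
# `extraction_config` is the JSON body. Only "data_type" and "tax_year" are read
# by _extract_hmrc_data today; anything else is carried through unchanged into
# the rpa.data_extracted event. BASE_URL, HEADERS and session_id as above.
extraction_config = {"data_type": "tax_returns", "tax_year": "2023-24"}

resp = httpx.post(
    f"{BASE_URL}/sessions/{session_id}/extract",
    json=extraction_config,
    headers=HEADERS,
)
print(resp.json()["records_count"])
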
@app.delete("/sessions/{session_id}")
async def close_session(
    session_id: str,
    current_user: dict[str, Any] = Depends(get_current_user),
    tenant_id: str = Depends(get_tenant_id),
) -> dict[str, str]:
    """Close RPA session"""

    with tracer.start_as_current_span("close_session") as span:
        span.set_attribute("session_id", session_id)

        try:
            session = _get_session(session_id, tenant_id)

            # Close browser context
            await session["context"].close()

            # Remove from active sessions
            del active_sessions[session_id]

            logger.info("Session closed", session_id=session_id)

            return {"status": "closed"}

        except HTTPException:
            raise
        except Exception as e:
            logger.error("Failed to close session", session_id=session_id, error=str(e))
            raise HTTPException(status_code=500, detail="Failed to close session")

def _get_session(session_id: str, tenant_id: str) -> dict[str, Any]:
    """Get and validate session"""
    if session_id not in active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = active_sessions[session_id]

    # Check tenant access
    if session["tenant_id"] != tenant_id:
        raise HTTPException(status_code=403, detail="Access denied")

    # Check timeout (total_seconds, not .seconds, so gaps longer than a day still expire)
    if (
        datetime.utcnow() - session["last_activity"]
    ).total_seconds() > settings.session_timeout:
        raise HTTPException(status_code=408, detail="Session expired")

    return session

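
# --- Illustrative pytest sketch (separate test module, not part of this file) --
# Exercises the tenant-isolation and expiry checks above by seeding
# active_sessions directly. The "apps.svc_rpa.main" import path and the minimal
# fake session shape are assumptions for the sketch.
from datetime import datetime, timedelta

import pytest
from fastapi import HTTPException

from apps.svc_rpa import main


def test_get_session_rejects_other_tenant() -> None:
    main.active_sessions["01TESTSESSION"] = {
        "tenant_id": "tenant-a",
        "last_activity": datetime.utcnow(),
    }
    with pytest.raises(HTTPException) as exc:
        main._get_session("01TESTSESSION", "tenant-b")
    assert exc.value.status_code == 403


def test_get_session_rejects_expired() -> None:
    main.active_sessions["01OLDSESSION"] = {
        "tenant_id": "tenant-a",
        "last_activity": datetime.utcnow()
        - timedelta(seconds=main.settings.session_timeout + 1),
    }
    with pytest.raises(HTTPException) as exc:
        main._get_session("01OLDSESSION", "tenant-a")
    assert exc.value.status_code == 408
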
async def _login_hmrc(page: Page, credentials: dict[str, str]) -> bool:
    """Login to HMRC portal"""
    try:
        # Navigate to HMRC login
        await page.goto(settings.hmrc_base_url)

        # Wait for login form
        await page.wait_for_selector('input[name="userId"]', timeout=settings.timeout)

        # Fill credentials
        await page.fill('input[name="userId"]', credentials.get("user_id", ""))
        await page.fill('input[name="password"]', credentials.get("password", ""))

        # Submit form
        await page.click('button[type="submit"]')

        # Wait for redirect or error
        await page.wait_for_load_state("networkidle")

        # Check if login was successful
        current_url = page.url
        return "sign-in" not in current_url.lower()

    except Exception as e:
        logger.error("HMRC login failed", error=str(e))
        return False

async def _login_open_banking(page: Page, credentials: dict[str, str]) -> bool:
    """Login to Open Banking portal"""
    try:
        # This would implement the Open Banking login flow
        # For now, return False as it's not implemented
        logger.warning("Open Banking login not implemented")
        return False

    except Exception as e:
        logger.error("Open Banking login failed", error=str(e))
        return False

async def _extract_hmrc_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
    """Extract data from HMRC portal"""
    try:
        data_type = config.get("data_type", "tax_returns")
        tax_year = config.get("tax_year", "2023-24")

        extracted_data = {
            "data_type": data_type,
            "tax_year": tax_year,
            "records": [],
            "extracted_at": datetime.utcnow().isoformat(),
        }

        if data_type == "tax_returns":
            # Navigate to tax returns section
            await page.click('a[href*="tax-return"]')
            await page.wait_for_load_state("networkidle")

            # Extract return data
            returns = await page.query_selector_all(".tax-return-item")
            for return_element in returns:
                return_data = await return_element.evaluate(
                    """
                    element => ({
                        year: element.querySelector('.tax-year')?.textContent?.trim(),
                        status: element.querySelector('.status')?.textContent?.trim(),
                        amount: element.querySelector('.amount')?.textContent?.trim()
                    })
                    """
                )
                extracted_data["records"].append(return_data)

        return extracted_data

    except Exception as e:
        logger.error("HMRC data extraction failed", error=str(e))
        return {"error": str(e), "records": []}

async def _extract_banking_data(page: Page, config: dict[str, Any]) -> dict[str, Any]:
    """Extract banking data via Open Banking"""
    try:
        # This would implement Open Banking data extraction
        logger.warning("Open Banking extraction not implemented")
        return {"error": "Not implemented", "records": []}

    except Exception as e:
        logger.error("Banking data extraction failed", error=str(e))
        return {"error": str(e), "records": []}

async def _cleanup_session_after_timeout(session_id: str, timeout_seconds: int) -> None:
    """Cleanup session after timeout"""
    await asyncio.sleep(timeout_seconds)

    if session_id in active_sessions:
        try:
            session = active_sessions[session_id]
            await session["context"].close()
            del active_sessions[session_id]
            logger.info("Session cleaned up due to timeout", session_id=session_id)
        except Exception as e:
            logger.error(
                "Failed to cleanup session", session_id=session_id, error=str(e)
            )

@app.exception_handler(HTTPException)
async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse:
    """Handle HTTP exceptions with RFC7807 format"""
    return JSONResponse(
        status_code=exc.status_code,
        content=ErrorResponse(
            type=f"https://httpstatuses.com/{exc.status_code}",
            title=exc.detail,
            status=exc.status_code,
            detail=exc.detail,
            instance=str(request.url),
            trace_id="",
        ).model_dump(),
    )

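
# --- Illustrative response shape ------------------------------------------------
# What a client receives for, e.g., an expired session, given the ErrorResponse
# fields used above. The instance URL is an example value; trace_id is currently
# always empty.
example_problem_details = {
    "type": "https://httpstatuses.com/408",
    "title": "Session expired",
    "status": 408,
    "detail": "Session expired",
    "instance": "http://localhost:8001/sessions/<session_id>/extract",
    "trace_id": "",
}
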
if __name__ == "__main__":
    import uvicorn

    uvicorn.run("main:app", host="0.0.0.0", port=8001, reload=True, log_config=None)
17
apps/svc_rpa/requirements.txt
Normal file
@@ -0,0 +1,17 @@
# FastAPI and server
fastapi>=0.104.1
uvicorn[standard]>=0.24.0
pydantic>=2.5.0

# Service-specific dependencies
# Browser automation
playwright>=1.40.0

# Additional async utilities
# asyncio-timeout>=4.0.3  # Deprecated, use asyncio.timeout from Python 3.11+ standard library

# Session management
aioredis>=2.0.1

# Browser management
psutil>=5.9.0
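
# Note: the playwright package above does not bundle browser binaries; the Docker
# image (or local environment) also needs the matching browser installed, e.g.
# `playwright install chromium` to satisfy browser_type in RPASettings.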