full ingestion -> OCR -> extraction flow is now working correctly.

2025-11-26 15:46:59 +00:00
parent fdba81809f
commit db61b05c80
17 changed files with 170 additions and 553 deletions
--- a/apps/svc_extract/main.py
+++ b/apps/svc_extract/main.py
@@ -64,28 +64,6 @@ Return a JSON object with the extracted fields and confidence scores.
 """
 # Create app and settings
 app, settings = create_app(
    service_name="svc-extract",
    title="Tax Agent Extraction Service",
    description="LLM-based field extraction service",
    settings_class=ExtractionSettings,
 )
 # Add middleware
 middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
 app.add_middleware(middleware_factory)
 # Global clients
 storage_client: StorageClient | None = None
 document_storage: DocumentStorage | None = None
 event_bus: EventBus | None = None
 confidence_calibrator: ConfidenceCalibrator | None = None
 tracer = get_tracer("svc-extract")
 metrics = get_metrics()
@app.on_event("startup")
 async def startup_event() -> None:
    """Initialize service dependencies"""
    global storage_client, document_storage, event_bus, confidence_calibrator
@@ -116,7 +94,6 @@ async def startup_event() -> None:
    logger.info("Extraction service started successfully")
@app.on_event("shutdown")
 async def shutdown_event() -> None:
    """Cleanup service dependencies"""
    global event_bus
@@ -129,6 +106,29 @@ async def shutdown_event() -> None:
    logger.info("Extraction service shutdown complete")
 # Create app and settings
 app, settings = create_app(
    service_name="svc-extract",
    title="Tax Agent Extraction Service",
    description="LLM-based field extraction service",
    settings_class=ExtractionSettings,
    startup_hooks=[startup_event],
    shutdown_hooks=[shutdown_event],
 )
 # Add middleware
 middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
 app.add_middleware(middleware_factory)
 # Global clients
 storage_client: StorageClient | None = None
 document_storage: DocumentStorage | None = None
 event_bus: EventBus | None = None
 confidence_calibrator: ConfidenceCalibrator | None = None
 tracer = get_tracer("svc-extract")
 metrics = get_metrics()
@app.post("/extract/{doc_id}", response_model=ExtractionResponse)
 async def extract_fields(
    doc_id: str,
@@ -334,13 +334,14 @@ async def _extract_fields_async(
                )
            # Update metrics
-            metrics.counter("extractions_completed_total").labels(
+            metrics.counter(
-                tenant_id=tenant_id, strategy=strategy
+                "extract_extractions_completed_total",
-            ).inc()
+                labelnames=["tenant_id", "strategy"],
            ).labels(tenant_id=tenant_id, strategy=strategy).inc()
-            metrics.histogram("extraction_confidence").labels(
+            metrics.histogram(
-                strategy=strategy
+                "extract_extraction_confidence", labelnames=["strategy"]
-            ).observe(calibrated_confidence)
+            ).labels(strategy=strategy).observe(calibrated_confidence)
            # Publish completion event
            event_payload = EventPayload(
@@ -371,7 +372,10 @@ async def _extract_fields_async(
            logger.error("Field extraction failed", doc_id=doc_id, error=str(e))
            # Update error metrics
-            metrics.counter("extraction_errors_total").labels(
+            metrics.counter(
                "extract_extraction_errors_total",
                labelnames=["tenant_id", "strategy", "error_type"],
            ).labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()
--- a/apps/svc_ingestion/main.py
+++ b/apps/svc_ingestion/main.py
@@ -77,11 +77,20 @@ def init_dependencies(app_settings: IngestionSettings) -> None:
 # Create app and settings
 async def startup_event() -> None:
    """Start the event bus referenced by the enclosing scope's ``event_bus``.

    Raises:
        ValueError: If ``event_bus`` is still ``None`` when this hook runs.
    """
    bus = event_bus
    if bus is None:
        raise ValueError("Event bus not initialized")
    await bus.start()
 app, _settings = create_app(
    service_name="svc-ingestion",
    title="Tax Agent Ingestion Service",
    description="Document upload and storage service",
    settings_class=IngestionSettings,
    startup_hooks=[startup_event],
 )
 # Initialize dependencies immediately
@@ -158,6 +167,7 @@ async def upload_document(
            event_payload = EventPayload(
                data={
                    "doc_id": doc_id,
                    "tenant_id": tenant_id,
                    "filename": file.filename or "unknown",
                    "kind": kind.value,
                    "source": source,
--- a/apps/svc_ocr/Dockerfile
+++ b/apps/svc_ocr/Dockerfile
@@ -21,8 +21,10 @@ RUN apt-get update && apt-get install -y \
 WORKDIR /app
 # Copy service-specific requirements and install
 # Copy base requirements and service-specific requirements
 COPY libs/requirements-base.txt /tmp/libs-requirements.txt
 COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
-RUN pip install --no-cache-dir -r /tmp/service-requirements.txt
+RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt
 # Copy application code
 COPY libs/ ./libs/
--- a/apps/svc_ocr/main.py
+++ b/apps/svc_ocr/main.py
@@ -118,7 +118,7 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
            if attempt == max_retries:
                raise HTTPException(
                    status_code=500, detail="Failed to connect to NATS after retries"
-                )
+                ) from e
            await asyncio.sleep(delay)
            delay *= 2  # exponential backoff
@@ -280,7 +280,7 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
        return
    # Auto-process PDF documents
-    if data.get("content_type") == "application/pdf":
+    if data.get("mime_type") == "application/pdf":
        logger.info("Auto-processing ingested document", doc_id=doc_id)
        try:
@@ -347,13 +347,13 @@ async def _process_document_async(
            await ds.store_ocr_result(tenant_id, doc_id, ocr_results)
            # Update metrics
-            metrics.counter("documents_processed_total").labels(
+            metrics.counter(
-                tenant_id=tenant_id, strategy=strategy
+                "ocr_documents_processed_total", labelnames=["tenant_id", "strategy"]
-            ).inc()
+            ).labels(tenant_id=tenant_id, strategy=strategy).inc()
-            metrics.histogram("processing_duration_seconds").labels(
+            metrics.histogram(
-                strategy=strategy
+                "ocr_processing_duration_seconds", labelnames=["strategy"]
-            ).observe(
+            ).labels(strategy=strategy).observe(
                datetime.utcnow().timestamp()
                - datetime.fromisoformat(
                    ocr_results["processed_at"].replace("Z", "")  # type: ignore
@@ -386,7 +386,10 @@ async def _process_document_async(
            logger.error("OCR processing failed", doc_id=doc_id, error=str(e))
            # Update error metrics
-            metrics.counter("processing_errors_total").labels(
+            metrics.counter(
                "ocr_processing_errors_total",
                labelnames=["tenant_id", "strategy", "error_type"],
            ).labels(
                tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
            ).inc()
--- a/infra/authentik/bootstrap.yaml
+++ b/infra/authentik/bootstrap.yaml
@@ -50,6 +50,20 @@ entries:
      groups:
        - !Find [authentik_core.group, [name, "Administrators"]]
  # --- E2E Test User ---------------------------------------------------------
  # NOTE(review): this entry commits a fixed plaintext password for an account
  # that is also placed in the "Administrators" group. Presumably it is meant
  # for local E2E runs only - confirm this blueprint is never applied to a
  # shared or production Authentik instance.
  - model: authentik_core.user
    state: present
    identifiers:
      username: e2e_tester
    attrs:
      name: "E2E Tester"
      email: e2e@example.com
      is_active: true
      password: "password123"
      groups:
        - !Find [authentik_core.group, [name, "Tax Reviewers"]]
        - !Find [authentik_core.group, [name, "Administrators"]]
  # Helper finders
  # ========= OIDC Providers + Applications ==================================
@@ -317,6 +331,37 @@ entries:
      meta_publisher: "AI Tax Agent"
      policy_engine_mode: "any"
  # --- NATS Monitoring (Proxy Provider for ForwardAuth) --------------------
  - model: authentik_providers_proxy.proxyprovider
    state: present
    identifiers:
      name: "NATS Monitoring Proxy"
    attrs:
      external_host: "https://nats.local.lan"
      internal_host: "http://apa-nats:8222"
      authorization_flow:
        !Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
      invalidation_flow:
        !Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
      mode: "forward_single"
      cookie_domain: "local.lan"
  - model: authentik_core.application
    state: present
    identifiers:
      slug: "nats-monitoring"
    attrs:
      name: "NATS Monitoring"
      provider:
        !Find [
          authentik_providers_proxy.proxyprovider,
          [name, "NATS Monitoring Proxy"],
        ]
      meta_launch_url: "https://nats.local.lan"
      meta_description: "NATS Messaging System Monitoring"
      meta_publisher: "AI Tax Agent"
      policy_engine_mode: "any"
  # --- AI Tax Agent API (Proxy Provider for ForwardAuth) --------------------
  - model: authentik_providers_proxy.proxyprovider
    state: present
@@ -368,3 +413,7 @@ entries:
            authentik_providers_proxy.proxyprovider,
            [name, "AI Tax Agent API Proxy"],
          ]
        - !Find [
            authentik_providers_proxy.proxyprovider,
            [name, "NATS Monitoring Proxy"],
          ]
--- a/infra/base/infrastructure.yaml
+++ b/infra/base/infrastructure.yaml
@@ -331,6 +331,8 @@ services:
    networks:
      - backend
      - frontend
    ports:
      - "4222:4222" # Client connections (for local testing)
    volumes:
      - nats_data:/data
    command: >
--- a/infra/compose/compose.override.yaml
+++ b/infra/compose/compose.override.yaml
@@ -49,6 +49,8 @@ services:
      dockerfile: apps/svc_ingestion/Dockerfile
    image: ai-tax-agent/svc-ingestion:local
    pull_policy: never
    ports:
      - "8000:8000" # Expose for local E2E testing
  apa-svc-extract:
    build:
--- a/libs/app_factory.py
+++ b/libs/app_factory.py
@@ -2,7 +2,7 @@
 # FILE: libs/app_factory.py
-from collections.abc import AsyncIterator
+from collections.abc import AsyncIterator, Awaitable, Callable
 from contextlib import asynccontextmanager
 from typing import Any
@@ -36,6 +36,8 @@ def create_app(  # pylint: disable=too-many-arguments,too-many-positional-argume
    version: str = "1.0.0",
    settings_class: type[BaseAppSettings] = BaseAppSettings,
    custom_settings: dict[str, Any] | None = None,
    startup_hooks: list[Callable[[], Awaitable[None]]] | None = None,
    shutdown_hooks: list[Callable[[], Awaitable[None]]] | None = None,
 ) -> tuple[FastAPI, BaseAppSettings]:
    """Create a FastAPI application with standard configuration"""
@@ -56,8 +58,14 @@ def create_app(  # pylint: disable=too-many-arguments,too-many-positional-argume
    ) -> AsyncIterator[None]:  # pylint: disable=unused-argument
        # Startup
        setup_observability(settings)
        if startup_hooks:
            for hook in startup_hooks:
                await hook()
        yield
        # Shutdown
        if shutdown_hooks:
            for hook in shutdown_hooks:
                await hook()
    # Create FastAPI app
    app = FastAPI(
--- a/libs/events/topics.py
+++ b/libs/events/topics.py
@@ -4,15 +4,15 @@
 class EventTopics:  # pylint: disable=too-few-public-methods
    """Standard event topic names"""
-    DOC_INGESTED = "doc.ingested"
+    DOC_INGESTED = "doc_ingested"
-    DOC_OCR_READY = "doc.ocr_ready"
+    DOC_OCR_READY = "doc_ocr_ready"
-    DOC_EXTRACTED = "doc.extracted"
+    DOC_EXTRACTED = "doc_extracted"
-    KG_UPSERT_READY = "kg.upsert.ready"
+    KG_UPSERT_READY = "kg_upsert_ready"
-    KG_UPSERTED = "kg.upserted"
+    KG_UPSERTED = "kg_upserted"
-    RAG_INDEXED = "rag.indexed"
+    RAG_INDEXED = "rag_indexed"
-    CALC_SCHEDULE_READY = "calc.schedule_ready"
+    CALC_SCHEDULE_READY = "calc_schedule_ready"
-    FORM_FILLED = "form.filled"
+    FORM_FILLED = "form_filled"
-    HMRC_SUBMITTED = "hmrc.submitted"
+    HMRC_SUBMITTED = "hmrc_submitted"
-    REVIEW_REQUESTED = "review.requested"
+    REVIEW_REQUESTED = "review_requested"
-    REVIEW_COMPLETED = "review.completed"
+    REVIEW_COMPLETED = "review_completed"
-    FIRM_SYNC_COMPLETED = "firm.sync.completed"
+    FIRM_SYNC_COMPLETED = "firm_sync_completed"
--- a/libs/requirements-base.txt
+++ b/libs/requirements-base.txt
@@ -11,7 +11,7 @@ psycopg2-binary>=2.9.11
 neo4j>=6.0.2
 redis[hiredis]>=6.4.0
-minio>=7.2.18
+minio==7.2.18
 boto3>=1.34.0
 qdrant-client>=1.15.1
--- a/libs/schemas/events.py
+++ b/libs/schemas/events.py
@@ -72,22 +72,23 @@ class DocumentExtractedEventData(BaseEventData):
    """Event emitted when field extraction is complete."""
    doc_id: str = Field(..., description="Document identifier")
    tenant_id: str = Field(..., description="Tenant identifier")
    extraction_id: str = Field(..., description="Unique extraction run identifier")
    strategy: Literal["llm", "rules", "hybrid"] = Field(
        ..., description="Extraction strategy used"
    )
-    fields_extracted: int = Field(..., ge=0, description="Number of fields extracted")
+    field_count: int = Field(..., ge=0, description="Number of fields extracted")
-    confidence_avg: float = Field(
+    confidence: float = Field(
-        ..., ge=0.0, le=1.0, description="Average extraction confidence"
+        ..., ge=0.0, le=1.0, description="Extraction confidence score"
    )
-    calibrated_confidence: float = Field(
+    extraction_results: dict[str, Any] = Field(
-        ..., ge=0.0, le=1.0, description="Calibrated confidence score"
+        ..., description="Full extraction results including provenance"
    )
    model_name: str | None = Field(None, description="LLM model used (if applicable)")
-    processing_time_ms: int = Field(
+    processing_time_ms: int | None = Field(
-        ..., ge=0, description="Processing time in milliseconds"
+        None, ge=0, description="Processing time in milliseconds"
    )
-    storage_path: str = Field(..., description="Path to extraction results")
+    storage_path: str | None = Field(None, description="Path to extraction results")
 # Knowledge Graph events
--- a/libs/security/dependencies.py
+++ b/libs/security/dependencies.py
@@ -41,6 +41,11 @@ def get_current_tenant(request: Request) -> str | None:
        if role.startswith("tenant:"):
            return str(role.split(":", 1)[1])
    # Check for explicit tenant header (useful for testing/API keys)
    tenant_header = request.headers.get("X-Tenant-ID")
    if tenant_header:
        return tenant_header
    # Default tenant for development
    return "default"
--- a/libs/storage/client.py
+++ b/libs/storage/client.py
@@ -19,17 +19,13 @@ class StorageClient:
    async def ensure_bucket(self, bucket_name: str, region: str = "us-east-1") -> bool:
        """Ensure bucket exists, create if not"""
        try:
-            # Check if bucket exists
+            self.client.make_bucket(bucket_name=bucket_name, location=region)
            if self.client.bucket_exists(bucket_name):
                logger.debug("Bucket already exists", bucket=bucket_name)
                return True
            # Create bucket
            self.client.make_bucket(bucket_name, location=region)
            logger.info("Created bucket", bucket=bucket_name, region=region)
            return True
        except S3Error as e:
            if e.code in ("BucketAlreadyOwnedByYou", "BucketAlreadyExists"):
                logger.debug("Bucket already exists", bucket=bucket_name)
                return True
            logger.error("Failed to ensure bucket", bucket=bucket_name, error=str(e))
            return False
--- a/scripts/authentik-blueprint-import.sh
+++ b/scripts/authentik-blueprint-import.sh
@@ -1,200 +0,0 @@
 #!/bin/bash
 # Test Authentik blueprint import after manual setup
 # Abort on any failing command, on use of an unset variable, and when any
 # stage of a pipeline fails.
 set -euo pipefail
 # Colors for output
 # ANSI escape sequences; they are only interpreted when printed with `echo -e`.
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color
 # Configuration
 # DOMAIN may be supplied through the environment; it falls back to "local",
 # which makes AUTHENTIK_URL resolve to https://auth.local.
 DOMAIN=${DOMAIN:-local}
 AUTHENTIK_URL="https://auth.${DOMAIN}"
 AUTHENTIK_API_URL="$AUTHENTIK_URL/api/v3"
 ADMIN_EMAIL="admin@local.local"
 # Taken from AUTHENTIK_ADMIN_PASSWORD when set, otherwise the default below.
 ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"
 echo -e "${BLUE}🧪 Testing Authentik blueprint import...${NC}"
 echo
 # Function to check if setup is complete
 check_setup_complete() {
    # Status 0 ("setup complete") only when the initial-setup flow URL answers
    # HTTP 404; any other status code, or a failed request, gives status 1.
    local auth_host http_status
    local -a pin_to_localhost

    # Reduce AUTHENTIK_URL to its bare host name (scheme and path removed).
    auth_host=$(sed -E 's#^https?://([^/]+).*$#\1#' <<< "$AUTHENTIK_URL")
    # Have curl connect to 127.0.0.1:443 for that host instead of resolving it.
    pin_to_localhost=(--resolve "${auth_host}:443:127.0.0.1")

    http_status=$(curl -ks "${pin_to_localhost[@]}" -o /dev/null -w '%{http_code}' "$AUTHENTIK_URL/if/flow/initial-setup/" || true)

    [[ "$http_status" == "404" ]] && return 0
    return 1
 }
 # Function to get API token via login
 get_api_token_via_login() {
    # Log in as the admin user with a cookie session, create an API token and
    # print ONLY the token key on stdout (status 0). Returns 1 on any failure.
    #
    # Reads:  AUTHENTIK_URL, AUTHENTIK_API_URL, ADMIN_EMAIL, ADMIN_PASSWORD.
    # Writes: /tmp/auth_cookies.txt and /tmp/login_response.html, which are
    #         left behind for the caller to remove.
    #
    # Every progress/error message goes to stderr: callers capture stdout with
    # $(...), so anything else printed there would become part of the "token".
    echo -e "${YELLOW}🔑 Getting API token via login...${NC}" >&2
    local host
    host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
    # Have curl connect to 127.0.0.1:443 for this host instead of resolving it.
    local resolve=(--resolve "${host}:443:127.0.0.1")
    # Get login page and extract CSRF token
    local login_page
    login_page=$(curl -ks "${resolve[@]}" -c /tmp/auth_cookies.txt "$AUTHENTIK_URL/if/flow/default-authentication-flow/" || echo "")
    if [ -z "$login_page" ]; then
        echo -e "${RED}❌ Could not access login page${NC}" >&2
        return 1
    fi
    # Extract CSRF token from the page
    local csrf_token
    csrf_token=$(echo "$login_page" | grep -o 'name="csrfmiddlewaretoken"[^>]*value="[^"]*"' | sed 's/.*value="\([^"]*\)".*/\1/' | head -1 || echo "")
    if [ -z "$csrf_token" ]; then
        echo -e "${RED}❌ Could not extract CSRF token${NC}" >&2
        return 1
    fi
    echo -e "${GREEN}✅ CSRF token extracted${NC}" >&2
    # Login. Each field is URL-encoded separately so characters such as '&',
    # '=', '+' or '%' in the e-mail or password reach the server unchanged.
    local login_response
    login_response=$(curl -ks "${resolve[@]}" -b /tmp/auth_cookies.txt -c /tmp/auth_cookies.txt \
        -X POST "$AUTHENTIK_URL/if/flow/default-authentication-flow/" \
        -H "Content-Type: application/x-www-form-urlencoded" \
        -H "Referer: $AUTHENTIK_URL/if/flow/default-authentication-flow/" \
        --data-urlencode "csrfmiddlewaretoken=$csrf_token" \
        --data-urlencode "uid_field=$ADMIN_EMAIL" \
        --data-urlencode "password=$ADMIN_PASSWORD" \
        -w '%{http_code}' -o /tmp/login_response.html || echo "")
    if [[ "$login_response" =~ ^(200|302)$ ]]; then
        echo -e "${GREEN}✅ Login successful${NC}" >&2
        # Get admin interface page to get new CSRF token
        local admin_page
        admin_page=$(curl -ks "${resolve[@]}" -b /tmp/auth_cookies.txt "$AUTHENTIK_URL/if/admin/" || echo "")
        local admin_csrf
        admin_csrf=$(echo "$admin_page" | grep -o 'name="csrfmiddlewaretoken"[^>]*value="[^"]*"' | sed 's/.*value="\([^"]*\)".*/\1/' | head -1 || echo "")
        if [ -n "$admin_csrf" ]; then
            # Create API token
            # NOTE(review): the expiry below is a fixed date; requests made
            # after it presumably yield an already-expired token - confirm and
            # consider computing the date instead.
            local token_response
            token_response=$(curl -ks "${resolve[@]}" -b /tmp/auth_cookies.txt \
                -X POST "$AUTHENTIK_API_URL/core/tokens/" \
                -H "Content-Type: application/json" \
                -H "X-CSRFToken: $admin_csrf" \
                -d "{
                    \"identifier\": \"blueprint-test-$(date +%s)\",
                    \"description\": \"Test token for blueprint import\",
                    \"expires\": \"2025-12-31T23:59:59Z\"
                }" 2>/dev/null || echo "")
            if [ -n "$token_response" ]; then
                local token
                token=$(echo "$token_response" | python3 -c "import sys, json; print(json.load(sys.stdin)['key'])" 2>/dev/null || echo "")
                if [ -n "$token" ]; then
                    echo -e "${GREEN}✅ API token created${NC}" >&2
                    # The only line written to stdout: the token itself.
                    echo "$token"
                    return 0
                fi
            fi
        fi
    fi
    echo -e "${RED}❌ Failed to get API token${NC}" >&2
    return 1
 }
 # Function to import blueprint
 import_blueprint() {
    # Register /blueprints/bootstrap.yaml as a blueprint instance through the
    # API and, when an instance id ("pk") comes back, call its apply endpoint.
    #   $1 - API token, sent as a Bearer credential
    # Returns 0 once the apply request has been issued (its response is only
    # printed, not inspected); returns 1 when no "pk" could be read.
    local bearer="$1"
    local auth_host create_json apply_json instance_pk
    local -a pin_to_localhost

    echo -e "${YELLOW}📋 Importing blueprint...${NC}"
    auth_host=$(sed -E 's#^https?://([^/]+).*$#\1#' <<< "$AUTHENTIK_URL")
    # Have curl connect to 127.0.0.1:443 for that host instead of resolving it.
    pin_to_localhost=(--resolve "${auth_host}:443:127.0.0.1")

    # Create blueprint instance
    create_json=$(curl -ks "${pin_to_localhost[@]}" \
        -X POST "$AUTHENTIK_API_URL/managed/blueprints/" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $bearer" \
        -d '{
            "name": "AI Tax Agent Bootstrap",
            "path": "/blueprints/bootstrap.yaml",
            "context": {},
            "enabled": true
        }' 2>/dev/null || echo "")
    echo -e "${BLUE}Blueprint creation response:${NC}"
    # Pretty-print when the body parses as JSON; otherwise show it raw.
    echo "$create_json" | python3 -c "import sys, json; print(json.dumps(json.load(sys.stdin), indent=2))" 2>/dev/null || echo "$create_json"

    instance_pk=$(echo "$create_json" | python3 -c "import sys, json; print(json.load(sys.stdin).get('pk', ''))" 2>/dev/null || echo "")
    if [ -z "$instance_pk" ]; then
        echo -e "${RED}❌ Failed to create blueprint${NC}"
        return 1
    fi
    echo -e "${GREEN}✅ Blueprint created with ID: $instance_pk${NC}"

    # Apply the blueprint
    echo -e "${YELLOW}🔄 Applying blueprint...${NC}"
    apply_json=$(curl -ks "${pin_to_localhost[@]}" \
        -X POST "$AUTHENTIK_API_URL/managed/blueprints/$instance_pk/apply/" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $bearer" \
        -d '{}' 2>/dev/null || echo "")
    echo -e "${BLUE}Blueprint apply response:${NC}"
    echo "$apply_json" | python3 -c "import sys, json; print(json.dumps(json.load(sys.stdin), indent=2))" 2>/dev/null || echo "$apply_json"
    return 0
 }
 # Main function
 main() {
    # Entry point: verify initial setup is finished, obtain an API token via
    # login, then import the blueprint. Returns 0 on success and 1 when any
    # step fails.
    #
    # The outcome is tracked in `status` instead of returning early so the
    # temp files (which hold session cookies) are removed on every path.
    local status=0
    local api_token
    # Check if setup is complete
    if ! check_setup_complete; then
        echo -e "${YELLOW}⚠️  Initial setup is still required${NC}"
        # Point at the URL this script actually probed and at the configured
        # credentials, rather than hard-coded values that can drift from them.
        echo -e "${BLUE}📋 Please complete setup at: ${AUTHENTIK_URL}/if/flow/initial-setup/${NC}"
        echo -e "${BLUE}Use credentials: ${ADMIN_EMAIL} / ${ADMIN_PASSWORD}${NC}"
        status=1
    else
        echo -e "${GREEN}✅ Initial setup is complete${NC}"
        # Get API token
        if ! api_token=$(get_api_token_via_login); then
            echo -e "${RED}❌ Could not get API token${NC}"
            status=1
        else
            echo -e "${GREEN}🔑 API token obtained${NC}"
            # Import blueprint
            if import_blueprint "$api_token"; then
                echo -e "${GREEN}🎉 Blueprint import test completed!${NC}"
            else
                echo -e "${RED}❌ Blueprint import failed${NC}"
                status=1
            fi
        fi
    fi
    # Cleanup
    rm -f /tmp/auth_cookies.txt /tmp/login_response.html
    return "$status"
 }
 # Run main function
 main "$@"
--- a/scripts/authentik-setup.sh
+++ b/scripts/authentik-setup.sh
@@ -1,155 +0,0 @@
 #!/bin/bash
 # Complete Authentik initial setup and get API token
 # Abort on any failing command, on use of an unset variable, and when any
 # stage of a pipeline fails.
 set -euo pipefail
 # Colors for output
 # ANSI escape sequences; they are only interpreted when printed with `echo -e`.
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color
 # Configuration
 # DOMAIN may be supplied through the environment; it falls back to "local",
 # which makes AUTHENTIK_URL resolve to https://auth.local.
 DOMAIN=${DOMAIN:-local}
 AUTHENTIK_URL="https://auth.${DOMAIN}"
 ADMIN_EMAIL="admin@local"
 # Taken from AUTHENTIK_ADMIN_PASSWORD when set, otherwise the default below.
 ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"
 # Relative path: resolved against the directory the script is run from.
 ENV_FILE="infra/compose/.env"
 echo -e "${BLUE}🔧 Completing Authentik initial setup...${NC}"
 echo
 # Function to update env file
 update_env_var() {
    # Set NAME=VALUE in $ENV_FILE: rewrite the existing "NAME=" line in place,
    # or append a new line when the file has none.
    #   $1 - variable name
    #   $2 - value, stored verbatim
    local var_name="$1"
    local var_value="$2"
    if grep -q "^${var_name}=" "$ENV_FILE"; then
        # Update existing variable.
        # Backslash-escape the characters sed treats specially in replacement
        # text ('\' and '&') plus the '|' delimiter used below; without this a
        # value containing any of them would be written incorrectly.
        local escaped_value
        escaped_value=$(printf '%s' "$var_value" | sed -e 's/[\\&|]/\\&/g')
        # macOS (BSD) sed needs an explicit, possibly empty, backup suffix
        # after -i; the Linux branch of the original passed -i alone.
        local -a in_place=(-i)
        if [[ "$OSTYPE" == "darwin"* ]]; then
            in_place=(-i '')
        fi
        sed "${in_place[@]}" "s|^${var_name}=.*|${var_name}=${escaped_value}|" "$ENV_FILE"
        echo -e "${GREEN}✅ Updated ${var_name}${NC}"
    else
        # Add new variable
        printf '%s=%s\n' "$var_name" "$var_value" >> "$ENV_FILE"
        echo -e "${GREEN}✅ Added ${var_name}${NC}"
    fi
 }
 # Function to check if setup is complete
 check_setup_status() {
    # Probe the initial-setup flow URL: status 0 ("setup complete") when it
    # answers HTTP 404, status 1 for every other result.
    local target_host probe_code
    local -a curl_pin

    # Bare host name taken from AUTHENTIK_URL (scheme and path removed).
    target_host=$(sed -E 's#^https?://([^/]+).*$#\1#' <<< "$AUTHENTIK_URL")
    # Have curl connect to 127.0.0.1:443 for that host instead of resolving it.
    curl_pin=(--resolve "${target_host}:443:127.0.0.1")

    probe_code=$(curl -ks "${curl_pin[@]}" -o /dev/null -w '%{http_code}' "$AUTHENTIK_URL/if/flow/initial-setup/" || true)

    [[ "$probe_code" == "404" ]] && return 0
    return 1
 }
 # Function to get API token
 get_api_token() {
    # Log in as the admin user with a cookie session, create the bootstrap API
    # token and print ONLY the token key on stdout (status 0). Returns 1 on
    # any failure.
    #
    # Reads:  AUTHENTIK_URL, ADMIN_EMAIL, ADMIN_PASSWORD.
    # Writes: /tmp/authentik_cookies.txt and /tmp/login_response.html, which
    #         are left behind for the caller to remove.
    #
    # Every progress/error message goes to stderr: callers capture stdout with
    # $(...), so anything else printed there would become part of the "token".
    echo -e "${YELLOW}🔑 Getting API token...${NC}" >&2
    local host
    host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
    # Have curl connect to 127.0.0.1:443 for this host instead of resolving it.
    local resolve=(--resolve "${host}:443:127.0.0.1")
    # Get CSRF token first. `head -1` keeps a single value even when the page
    # contains more than one matching input.
    local csrf_token
    csrf_token=$(curl -ks "${resolve[@]}" -c /tmp/authentik_cookies.txt "$AUTHENTIK_URL/if/flow/default-authentication-flow/" | grep -o 'csrfmiddlewaretoken[^>]*value="[^"]*"' | sed 's/.*value="\([^"]*\)".*/\1/' | head -1 || echo "")
    if [ -z "$csrf_token" ]; then
        echo -e "${RED}❌ Could not get CSRF token${NC}" >&2
        return 1
    fi
    # Login to get session. Each field is URL-encoded separately so characters
    # such as '&', '=', '+' or '%' in the password reach the server unchanged.
    local login_response
    login_response=$(curl -ks "${resolve[@]}" -b /tmp/authentik_cookies.txt -c /tmp/authentik_cookies.txt \
        -X POST "$AUTHENTIK_URL/if/flow/default-authentication-flow/" \
        -H "Content-Type: application/x-www-form-urlencoded" \
        -H "Referer: $AUTHENTIK_URL/if/flow/default-authentication-flow/" \
        --data-urlencode "csrfmiddlewaretoken=$csrf_token" \
        --data-urlencode "uid_field=$ADMIN_EMAIL" \
        --data-urlencode "password=$ADMIN_PASSWORD" \
        -w '%{http_code}' -o /tmp/login_response.html || echo "")
    if [[ "$login_response" =~ ^(200|302)$ ]]; then
        echo -e "${GREEN}✅ Login successful${NC}" >&2
        # Create API token
        # NOTE(review): the expiry below is a fixed date; requests made after
        # it presumably yield an already-expired token - confirm and consider
        # computing the date instead.
        local token_response
        token_response=$(curl -ks "${resolve[@]}" -b /tmp/authentik_cookies.txt \
            -X POST "$AUTHENTIK_URL/api/v3/core/tokens/" \
            -H "Content-Type: application/json" \
            -H "X-CSRFToken: $csrf_token" \
            -d "{
                \"identifier\": \"ai-tax-agent-bootstrap\",
                \"description\": \"Bootstrap token for AI Tax Agent setup\",
                \"expires\": \"2025-12-31T23:59:59Z\"
            }" 2>/dev/null || echo "")
        if [ -n "$token_response" ]; then
            local token
            token=$(echo "$token_response" | python3 -c "import sys, json; print(json.load(sys.stdin)['key'])" 2>/dev/null || echo "")
            if [ -n "$token" ]; then
                echo -e "${GREEN}✅ API token created${NC}" >&2
                # The only line written to stdout: the token itself.
                echo "$token"
                return 0
            fi
        fi
    fi
    echo -e "${RED}❌ Failed to get API token${NC}" >&2
    return 1
 }
 # Main function
 main() {
    # Entry point: when initial setup is already done, obtain an API token and
    # store it as AUTHENTIK_BOOTSTRAP_TOKEN in the env file; otherwise print
    # manual instructions. The temp files are removed on every path.
    #
    # Help text uses AUTHENTIK_URL so the printed links always match the
    # instance this script actually probed.
    # Check if setup is already complete
    if check_setup_status; then
        echo -e "${GREEN}✅ Authentik setup is already complete${NC}"
        # Try to get API token
        local api_token
        if api_token=$(get_api_token); then
            echo -e "${GREEN}🔑 API token obtained${NC}"
            # Update .env file with token
            update_env_var "AUTHENTIK_BOOTSTRAP_TOKEN" "$api_token"
            echo
            echo -e "${GREEN}🎉 Setup complete! You can now run:${NC}"
            echo -e "  ${BLUE}make setup-authentik${NC} - to import blueprint configuration"
        else
            echo -e "${YELLOW}⚠️  Could not get API token automatically${NC}"
            echo -e "${BLUE}📋 Manual steps:${NC}"
            echo -e "  1. Open ${BLUE}${AUTHENTIK_URL}${NC} and log in"
            echo -e "  2. Go to Admin Interface > Tokens"
            echo -e "  3. Create a new token and update AUTHENTIK_BOOTSTRAP_TOKEN in .env"
        fi
    else
        echo -e "${YELLOW}📋 Initial setup still required:${NC}"
        echo -e "  1. Open ${BLUE}${AUTHENTIK_URL}/if/flow/initial-setup/${NC}"
        echo -e "  2. Complete the setup wizard with these credentials:"
        echo -e "     • Email: ${BLUE}$ADMIN_EMAIL${NC}"
        echo -e "     • Password: ${BLUE}$ADMIN_PASSWORD${NC}"
        echo -e "  3. Re-run this script after setup is complete"
    fi
    # Cleanup
    rm -f /tmp/authentik_cookies.txt /tmp/login_response.html
 }
 # Run main function
 main "$@"
--- a/scripts/authentik_setup.sh
+++ b/scripts/authentik_setup.sh
@@ -1,125 +0,0 @@
 #!/bin/bash
 # Automatically complete Authentik initial setup
 # Abort on any failing command, on use of an unset variable, and when any
 # stage of a pipeline fails.
 set -euo pipefail
 # Colors for output
 # ANSI escape sequences; they are only interpreted when printed with `echo -e`.
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
 NC='\033[0m' # No Color
 # Configuration
 # DOMAIN may be supplied through the environment; it falls back to "local",
 # which makes AUTHENTIK_URL resolve to https://auth.local.
 DOMAIN=${DOMAIN:-local}
 AUTHENTIK_URL="https://auth.${DOMAIN}"
 ADMIN_EMAIL="admin@local.lan"
 # Taken from AUTHENTIK_ADMIN_PASSWORD when set, otherwise the default below.
 ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"
 echo -e "${BLUE}🤖 Automatically completing Authentik initial setup...${NC}"
 echo
 # Function to complete initial setup
 complete_initial_setup() {
    # Submit the initial-setup form with ADMIN_EMAIL / ADMIN_PASSWORD, then
    # re-probe the setup URL to verify it.
    # Returns 0 only when the form POST answers 200/302 AND the follow-up
    # probe answers 404; returns 1 otherwise.
    # Writes /tmp/authentik_setup_cookies.txt and /tmp/setup_response.html,
    # which are left behind for the caller to remove.
    local host
    host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
    # Have curl connect to 127.0.0.1:443 for this host instead of resolving it.
    local resolve=(--resolve "${host}:443:127.0.0.1")
    local setup_url="$AUTHENTIK_URL/if/flow/initial-setup/"
    echo -e "${YELLOW}📋 Completing initial setup form...${NC}"
    # Get the initial setup page and extract CSRF token
    local setup_page
    setup_page=$(curl -ks "${resolve[@]}" -c /tmp/authentik_setup_cookies.txt "$setup_url" || echo "")
    if [ -z "$setup_page" ]; then
        echo -e "${RED}❌ Could not access setup page${NC}"
        return 1
    fi
    # Extract CSRF token
    local csrf_token
    csrf_token=$(echo "$setup_page" | grep -o 'csrfmiddlewaretoken[^>]*value="[^"]*"' | sed 's/.*value="\([^"]*\)".*/\1/' | head -1 || echo "")
    if [ -z "$csrf_token" ]; then
        echo -e "${RED}❌ Could not extract CSRF token${NC}"
        return 1
    fi
    echo -e "${GREEN}✅ CSRF token extracted${NC}"
    # Submit the initial setup form. Each field is URL-encoded separately so
    # characters such as '&', '=', '+' or '%' in the password are not
    # misread as form syntax.
    local setup_response
    setup_response=$(curl -ks "${resolve[@]}" -b /tmp/authentik_setup_cookies.txt -c /tmp/authentik_setup_cookies.txt \
        -X POST "$setup_url" \
        -H "Content-Type: application/x-www-form-urlencoded" \
        -H "Referer: $setup_url" \
        --data-urlencode "csrfmiddlewaretoken=$csrf_token" \
        --data-urlencode "email=$ADMIN_EMAIL" \
        --data-urlencode "password=$ADMIN_PASSWORD" \
        --data-urlencode "password_repeat=$ADMIN_PASSWORD" \
        -w '%{http_code}' -o /tmp/setup_response.html || echo "")
    if [[ ! "$setup_response" =~ ^(200|302)$ ]]; then
        echo -e "${RED}❌ Setup failed (HTTP $setup_response)${NC}"
        return 1
    fi
    echo -e "${GREEN}✅ Initial setup completed successfully${NC}"
    # Wait a moment for setup to complete
    sleep 3
    # Verify setup is complete by checking if setup page returns 404
    local verify_code
    verify_code=$(curl -ks "${resolve[@]}" -o /dev/null -w '%{http_code}' "$setup_url" || true)
    if [[ "$verify_code" == "404" ]]; then
        echo -e "${GREEN}✅ Setup verification successful${NC}"
        return 0
    fi
    echo -e "${YELLOW}⚠️  Setup may not be complete (verification returned $verify_code)${NC}"
    return 1
 }
 # Function to check if setup is needed
 check_setup_needed() {
    # Status 0 ("setup needed") when the initial-setup flow URL answers
    # HTTP 200; status 1 for every other result.
    # TODO: this is not a valid check if setup is already complete, needs work.
    # Authentik returns 200 even if setup is complete
    local auth_host probe_status
    local -a pin_to_localhost

    # Bare host name taken from AUTHENTIK_URL (scheme and path removed).
    auth_host=$(sed -E 's#^https?://([^/]+).*$#\1#' <<< "$AUTHENTIK_URL")
    # Have curl connect to 127.0.0.1:443 for that host instead of resolving it.
    pin_to_localhost=(--resolve "${auth_host}:443:127.0.0.1")

    probe_status=$(curl -ks "${pin_to_localhost[@]}" -o /dev/null -w '%{http_code}' "$AUTHENTIK_URL/if/flow/initial-setup/" || true)

    [[ "$probe_status" == "200" ]] && return 0
    return 1
 }
 # Main function
 main() {
    # Entry point: run the automatic initial setup when the probe says it is
    # needed, otherwise report that setup is already complete. The temp files
    # are removed on every path.
    if check_setup_needed; then
        echo -e "${YELLOW}📋 Initial setup is required${NC}"
        if complete_initial_setup; then
            echo -e "${GREEN}🎉 Authentik initial setup completed automatically!${NC}"
            echo
            echo -e "${BLUE}📋 Next steps:${NC}"
            echo -e "  1. Run ${BLUE}make complete-authentik-setup${NC} to get API token"
            echo -e "  2. Run ${BLUE}make setup-authentik${NC} to import blueprint configuration"
            echo -e "  3. Or run ${BLUE}make setup-sso${NC} to do both automatically"
        else
            echo -e "${RED}❌ Automatic setup failed${NC}"
            echo -e "${YELLOW}📋 Manual setup required:${NC}"
            # Use AUTHENTIK_URL so the link matches the instance that was
            # probed, instead of a hard-coded host name.
            echo -e "  1. Open ${BLUE}${AUTHENTIK_URL}/if/flow/initial-setup/${NC}"
            echo -e "  2. Use credentials: ${BLUE}$ADMIN_EMAIL${NC} / ${BLUE}$ADMIN_PASSWORD${NC}"
        fi
    else
        echo -e "${GREEN}✅ Authentik setup is already complete${NC}"
    fi
    # Cleanup
    rm -f /tmp/authentik_setup_cookies.txt /tmp/setup_response.html
 }
 # Run main function
 main "$@"
--- a/tests/e2e/test_backend_journey.py
+++ b/tests/e2e/test_backend_journey.py
@@ -38,14 +38,29 @@ async def test_backend_journey():
    try:
        # 2. Upload a document
-        async with httpx.AsyncClient() as client:
+        async with httpx.AsyncClient(
            verify=False
        ) as client:  # Disable SSL verification for local testing
            # Create a dummy PDF file
-            files = {"file": ("test.pdf", b"%PDF-1.4 mock content", "application/pdf")}
+            # Create a valid minimal PDF file
            pdf_content = (
                b"%PDF-1.0\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj 2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj "
                b"3 0 obj<</Type/Page/MediaBox[0 0 3 3]/Parent 2 0 R/Resources<<>>>>endobj\nxref\n0 4\n0000000000 65535 f\n"
                b"0000000010 00000 n\n0000000060 00000 n\n0000000111 00000 n\ntrailer<</Size 4/Root 1 0 R>>\nstartxref\n190\n%%EOF"
            )
            files = {"file": ("test.pdf", pdf_content, "application/pdf")}
            response = await client.post(
                f"{INGESTION_URL}/upload",
                files=files,
                data={"kind": "invoice", "source": "e2e_test"},
-                headers={"X-Tenant-ID": TENANT_ID, "X-User-ID": "e2e_tester"},
+                headers={
                    "X-Tenant-ID": TENANT_ID,
                    "X-User-ID": "e2e_tester",
                    # Required by TrustedProxyMiddleware
                    "X-Authenticated-User": "e2e_tester",
                    "X-Authenticated-Email": "e2e@example.com",
                    "Authorization": "Bearer mock-token",
                },
            )
            assert response.status_code == 200, f"Upload failed: {response.text}"
            upload_data = response.json()