full ingestion -> OCR -> extraction flow is now working correctly.
Some checks failed
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled

This commit is contained in:
harkon
2025-11-26 15:46:59 +00:00
parent fdba81809f
commit db61b05c80
17 changed files with 170 additions and 553 deletions

View File

@@ -64,28 +64,6 @@ Return a JSON object with the extracted fields and confidence scores.
""" """
# Create app and settings
app, settings = create_app(
service_name="svc-extract",
title="Tax Agent Extraction Service",
description="LLM-based field extraction service",
settings_class=ExtractionSettings,
)
# Add middleware
middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
app.add_middleware(middleware_factory)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
confidence_calibrator: ConfidenceCalibrator | None = None
tracer = get_tracer("svc-extract")
metrics = get_metrics()
@app.on_event("startup")
async def startup_event() -> None: async def startup_event() -> None:
"""Initialize service dependencies""" """Initialize service dependencies"""
global storage_client, document_storage, event_bus, confidence_calibrator global storage_client, document_storage, event_bus, confidence_calibrator
@@ -116,7 +94,6 @@ async def startup_event() -> None:
logger.info("Extraction service started successfully") logger.info("Extraction service started successfully")
@app.on_event("shutdown")
async def shutdown_event() -> None: async def shutdown_event() -> None:
"""Cleanup service dependencies""" """Cleanup service dependencies"""
global event_bus global event_bus
@@ -129,6 +106,29 @@ async def shutdown_event() -> None:
logger.info("Extraction service shutdown complete") logger.info("Extraction service shutdown complete")
# Create app and settings
app, settings = create_app(
service_name="svc-extract",
title="Tax Agent Extraction Service",
description="LLM-based field extraction service",
settings_class=ExtractionSettings,
startup_hooks=[startup_event],
shutdown_hooks=[shutdown_event],
)
# Add middleware
middleware_factory = create_trusted_proxy_middleware(settings.internal_cidrs)
app.add_middleware(middleware_factory)
# Global clients
storage_client: StorageClient | None = None
document_storage: DocumentStorage | None = None
event_bus: EventBus | None = None
confidence_calibrator: ConfidenceCalibrator | None = None
tracer = get_tracer("svc-extract")
metrics = get_metrics()
@app.post("/extract/{doc_id}", response_model=ExtractionResponse) @app.post("/extract/{doc_id}", response_model=ExtractionResponse)
async def extract_fields( async def extract_fields(
doc_id: str, doc_id: str,
@@ -334,13 +334,14 @@ async def _extract_fields_async(
) )
# Update metrics # Update metrics
metrics.counter("extractions_completed_total").labels( metrics.counter(
tenant_id=tenant_id, strategy=strategy "extract_extractions_completed_total",
).inc() labelnames=["tenant_id", "strategy"],
).labels(tenant_id=tenant_id, strategy=strategy).inc()
metrics.histogram("extraction_confidence").labels( metrics.histogram(
strategy=strategy "extract_extraction_confidence", labelnames=["strategy"]
).observe(calibrated_confidence) ).labels(strategy=strategy).observe(calibrated_confidence)
# Publish completion event # Publish completion event
event_payload = EventPayload( event_payload = EventPayload(
@@ -371,7 +372,10 @@ async def _extract_fields_async(
logger.error("Field extraction failed", doc_id=doc_id, error=str(e)) logger.error("Field extraction failed", doc_id=doc_id, error=str(e))
# Update error metrics # Update error metrics
metrics.counter("extraction_errors_total").labels( metrics.counter(
"extract_extraction_errors_total",
labelnames=["tenant_id", "strategy", "error_type"],
).labels(
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__ tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
).inc() ).inc()

View File

@@ -77,11 +77,20 @@ def init_dependencies(app_settings: IngestionSettings) -> None:
# Create app and settings # Create app and settings
async def startup_event() -> None:
"""Initialize service dependencies"""
if event_bus is None:
raise ValueError("Event bus not initialized")
await event_bus.start()
app, _settings = create_app( app, _settings = create_app(
service_name="svc-ingestion", service_name="svc-ingestion",
title="Tax Agent Ingestion Service", title="Tax Agent Ingestion Service",
description="Document upload and storage service", description="Document upload and storage service",
settings_class=IngestionSettings, settings_class=IngestionSettings,
startup_hooks=[startup_event],
) )
# Initialize dependencies immediately # Initialize dependencies immediately
@@ -158,6 +167,7 @@ async def upload_document(
event_payload = EventPayload( event_payload = EventPayload(
data={ data={
"doc_id": doc_id, "doc_id": doc_id,
"tenant_id": tenant_id,
"filename": file.filename or "unknown", "filename": file.filename or "unknown",
"kind": kind.value, "kind": kind.value,
"source": source, "source": source,

View File

@@ -21,8 +21,10 @@ RUN apt-get update && apt-get install -y \
WORKDIR /app WORKDIR /app
# Copy service-specific requirements and install # Copy service-specific requirements and install
# Copy base requirements and service-specific requirements
COPY libs/requirements-base.txt /tmp/libs-requirements.txt
COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt COPY apps/svc_ocr/requirements.txt /tmp/service-requirements.txt
RUN pip install --no-cache-dir -r /tmp/service-requirements.txt RUN pip install --no-cache-dir -r /tmp/libs-requirements.txt -r /tmp/service-requirements.txt
# Copy application code # Copy application code
COPY libs/ ./libs/ COPY libs/ ./libs/

View File

@@ -118,7 +118,7 @@ async def init_dependencies(app_settings: OCRSettings) -> None:
if attempt == max_retries: if attempt == max_retries:
raise HTTPException( raise HTTPException(
status_code=500, detail="Failed to connect to NATS after retries" status_code=500, detail="Failed to connect to NATS after retries"
) ) from e
await asyncio.sleep(delay) await asyncio.sleep(delay)
delay *= 2 # exponential backoff delay *= 2 # exponential backoff
@@ -280,7 +280,7 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None:
return return
# Auto-process PDF documents # Auto-process PDF documents
if data.get("content_type") == "application/pdf": if data.get("mime_type") == "application/pdf":
logger.info("Auto-processing ingested document", doc_id=doc_id) logger.info("Auto-processing ingested document", doc_id=doc_id)
try: try:
@@ -347,13 +347,13 @@ async def _process_document_async(
await ds.store_ocr_result(tenant_id, doc_id, ocr_results) await ds.store_ocr_result(tenant_id, doc_id, ocr_results)
# Update metrics # Update metrics
metrics.counter("documents_processed_total").labels( metrics.counter(
tenant_id=tenant_id, strategy=strategy "ocr_documents_processed_total", labelnames=["tenant_id", "strategy"]
).inc() ).labels(tenant_id=tenant_id, strategy=strategy).inc()
metrics.histogram("processing_duration_seconds").labels( metrics.histogram(
strategy=strategy "ocr_processing_duration_seconds", labelnames=["strategy"]
).observe( ).labels(strategy=strategy).observe(
datetime.utcnow().timestamp() datetime.utcnow().timestamp()
- datetime.fromisoformat( - datetime.fromisoformat(
ocr_results["processed_at"].replace("Z", "") # type: ignore ocr_results["processed_at"].replace("Z", "") # type: ignore
@@ -386,7 +386,10 @@ async def _process_document_async(
logger.error("OCR processing failed", doc_id=doc_id, error=str(e)) logger.error("OCR processing failed", doc_id=doc_id, error=str(e))
# Update error metrics # Update error metrics
metrics.counter("processing_errors_total").labels( metrics.counter(
"ocr_processing_errors_total",
labelnames=["tenant_id", "strategy", "error_type"],
).labels(
tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__ tenant_id=tenant_id, strategy=strategy, error_type=type(e).__name__
).inc() ).inc()

View File

@@ -50,6 +50,20 @@ entries:
groups: groups:
- !Find [authentik_core.group, [name, "Administrators"]] - !Find [authentik_core.group, [name, "Administrators"]]
# --- E2E Test User ---------------------------------------------------------
- model: authentik_core.user
state: present
identifiers:
username: e2e_tester
attrs:
name: "E2E Tester"
email: e2e@example.com
is_active: true
password: "password123"
groups:
- !Find [authentik_core.group, [name, "Tax Reviewers"]]
- !Find [authentik_core.group, [name, "Administrators"]]
# Helper finders # Helper finders
# ========= OIDC Providers + Applications ================================== # ========= OIDC Providers + Applications ==================================
@@ -317,6 +331,37 @@ entries:
meta_publisher: "AI Tax Agent" meta_publisher: "AI Tax Agent"
policy_engine_mode: "any" policy_engine_mode: "any"
# --- NATS Monitoring (Proxy Provider for ForwardAuth) --------------------
- model: authentik_providers_proxy.proxyprovider
state: present
identifiers:
name: "NATS Monitoring Proxy"
attrs:
external_host: "https://nats.local.lan"
internal_host: "http://apa-nats:8222"
authorization_flow:
!Find [authentik_flows.flow, [slug, "default-authentication-flow"]]
invalidation_flow:
!Find [authentik_flows.flow, [slug, "default-invalidation-flow"]]
mode: "forward_single"
cookie_domain: "local.lan"
- model: authentik_core.application
state: present
identifiers:
slug: "nats-monitoring"
attrs:
name: "NATS Monitoring"
provider:
!Find [
authentik_providers_proxy.proxyprovider,
[name, "NATS Monitoring Proxy"],
]
meta_launch_url: "https://nats.local.lan"
meta_description: "NATS Messaging System Monitoring"
meta_publisher: "AI Tax Agent"
policy_engine_mode: "any"
# --- AI Tax Agent API (Proxy Provider for ForwardAuth) -------------------- # --- AI Tax Agent API (Proxy Provider for ForwardAuth) --------------------
- model: authentik_providers_proxy.proxyprovider - model: authentik_providers_proxy.proxyprovider
state: present state: present
@@ -368,3 +413,7 @@ entries:
authentik_providers_proxy.proxyprovider, authentik_providers_proxy.proxyprovider,
[name, "AI Tax Agent API Proxy"], [name, "AI Tax Agent API Proxy"],
] ]
- !Find [
authentik_providers_proxy.proxyprovider,
[name, "NATS Monitoring Proxy"],
]

View File

@@ -331,6 +331,8 @@ services:
networks: networks:
- backend - backend
- frontend - frontend
ports:
- "4222:4222" # Client connections (for local testing)
volumes: volumes:
- nats_data:/data - nats_data:/data
command: > command: >

View File

@@ -49,6 +49,8 @@ services:
dockerfile: apps/svc_ingestion/Dockerfile dockerfile: apps/svc_ingestion/Dockerfile
image: ai-tax-agent/svc-ingestion:local image: ai-tax-agent/svc-ingestion:local
pull_policy: never pull_policy: never
ports:
- "8000:8000" # Expose for local E2E testing
apa-svc-extract: apa-svc-extract:
build: build:

View File

@@ -2,7 +2,7 @@
# FILE: libs/app_factory.py # FILE: libs/app_factory.py
from collections.abc import AsyncIterator from collections.abc import AsyncIterator, Awaitable, Callable
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import Any from typing import Any
@@ -36,6 +36,8 @@ def create_app( # pylint: disable=too-many-arguments,too-many-positional-argume
version: str = "1.0.0", version: str = "1.0.0",
settings_class: type[BaseAppSettings] = BaseAppSettings, settings_class: type[BaseAppSettings] = BaseAppSettings,
custom_settings: dict[str, Any] | None = None, custom_settings: dict[str, Any] | None = None,
startup_hooks: list[Callable[[], Awaitable[None]]] | None = None,
shutdown_hooks: list[Callable[[], Awaitable[None]]] | None = None,
) -> tuple[FastAPI, BaseAppSettings]: ) -> tuple[FastAPI, BaseAppSettings]:
"""Create a FastAPI application with standard configuration""" """Create a FastAPI application with standard configuration"""
@@ -56,8 +58,14 @@ def create_app( # pylint: disable=too-many-arguments,too-many-positional-argume
) -> AsyncIterator[None]: # pylint: disable=unused-argument ) -> AsyncIterator[None]: # pylint: disable=unused-argument
# Startup # Startup
setup_observability(settings) setup_observability(settings)
if startup_hooks:
for hook in startup_hooks:
await hook()
yield yield
# Shutdown # Shutdown
if shutdown_hooks:
for hook in shutdown_hooks:
await hook()
# Create FastAPI app # Create FastAPI app
app = FastAPI( app = FastAPI(

View File

@@ -4,15 +4,15 @@
class EventTopics: # pylint: disable=too-few-public-methods class EventTopics: # pylint: disable=too-few-public-methods
"""Standard event topic names""" """Standard event topic names"""
DOC_INGESTED = "doc.ingested" DOC_INGESTED = "doc_ingested"
DOC_OCR_READY = "doc.ocr_ready" DOC_OCR_READY = "doc_ocr_ready"
DOC_EXTRACTED = "doc.extracted" DOC_EXTRACTED = "doc_extracted"
KG_UPSERT_READY = "kg.upsert.ready" KG_UPSERT_READY = "kg_upsert_ready"
KG_UPSERTED = "kg.upserted" KG_UPSERTED = "kg_upserted"
RAG_INDEXED = "rag.indexed" RAG_INDEXED = "rag_indexed"
CALC_SCHEDULE_READY = "calc.schedule_ready" CALC_SCHEDULE_READY = "calc_schedule_ready"
FORM_FILLED = "form.filled" FORM_FILLED = "form_filled"
HMRC_SUBMITTED = "hmrc.submitted" HMRC_SUBMITTED = "hmrc_submitted"
REVIEW_REQUESTED = "review.requested" REVIEW_REQUESTED = "review_requested"
REVIEW_COMPLETED = "review.completed" REVIEW_COMPLETED = "review_completed"
FIRM_SYNC_COMPLETED = "firm.sync.completed" FIRM_SYNC_COMPLETED = "firm_sync_completed"

View File

@@ -11,7 +11,7 @@ psycopg2-binary>=2.9.11
neo4j>=6.0.2 neo4j>=6.0.2
redis[hiredis]>=6.4.0 redis[hiredis]>=6.4.0
minio>=7.2.18 minio==7.2.18
boto3>=1.34.0 boto3>=1.34.0
qdrant-client>=1.15.1 qdrant-client>=1.15.1

View File

@@ -72,22 +72,23 @@ class DocumentExtractedEventData(BaseEventData):
"""Event emitted when field extraction is complete.""" """Event emitted when field extraction is complete."""
doc_id: str = Field(..., description="Document identifier") doc_id: str = Field(..., description="Document identifier")
tenant_id: str = Field(..., description="Tenant identifier")
extraction_id: str = Field(..., description="Unique extraction run identifier") extraction_id: str = Field(..., description="Unique extraction run identifier")
strategy: Literal["llm", "rules", "hybrid"] = Field( strategy: Literal["llm", "rules", "hybrid"] = Field(
..., description="Extraction strategy used" ..., description="Extraction strategy used"
) )
fields_extracted: int = Field(..., ge=0, description="Number of fields extracted") field_count: int = Field(..., ge=0, description="Number of fields extracted")
confidence_avg: float = Field( confidence: float = Field(
..., ge=0.0, le=1.0, description="Average extraction confidence" ..., ge=0.0, le=1.0, description="Extraction confidence score"
) )
calibrated_confidence: float = Field( extraction_results: dict[str, Any] = Field(
..., ge=0.0, le=1.0, description="Calibrated confidence score" ..., description="Full extraction results including provenance"
) )
model_name: str | None = Field(None, description="LLM model used (if applicable)") model_name: str | None = Field(None, description="LLM model used (if applicable)")
processing_time_ms: int = Field( processing_time_ms: int | None = Field(
..., ge=0, description="Processing time in milliseconds" None, ge=0, description="Processing time in milliseconds"
) )
storage_path: str = Field(..., description="Path to extraction results") storage_path: str | None = Field(None, description="Path to extraction results")
# Knowledge Graph events # Knowledge Graph events

View File

@@ -41,6 +41,11 @@ def get_current_tenant(request: Request) -> str | None:
if role.startswith("tenant:"): if role.startswith("tenant:"):
return str(role.split(":", 1)[1]) return str(role.split(":", 1)[1])
# Check for explicit tenant header (useful for testing/API keys)
tenant_header = request.headers.get("X-Tenant-ID")
if tenant_header:
return tenant_header
# Default tenant for development # Default tenant for development
return "default" return "default"

View File

@@ -19,17 +19,13 @@ class StorageClient:
async def ensure_bucket(self, bucket_name: str, region: str = "us-east-1") -> bool: async def ensure_bucket(self, bucket_name: str, region: str = "us-east-1") -> bool:
"""Ensure bucket exists, create if not""" """Ensure bucket exists, create if not"""
try: try:
# Check if bucket exists self.client.make_bucket(bucket_name=bucket_name, location=region)
if self.client.bucket_exists(bucket_name):
logger.debug("Bucket already exists", bucket=bucket_name)
return True
# Create bucket
self.client.make_bucket(bucket_name, location=region)
logger.info("Created bucket", bucket=bucket_name, region=region) logger.info("Created bucket", bucket=bucket_name, region=region)
return True return True
except S3Error as e: except S3Error as e:
if e.code in ("BucketAlreadyOwnedByYou", "BucketAlreadyExists"):
logger.debug("Bucket already exists", bucket=bucket_name)
return True
logger.error("Failed to ensure bucket", bucket=bucket_name, error=str(e)) logger.error("Failed to ensure bucket", bucket=bucket_name, error=str(e))
return False return False

View File

@@ -1,200 +0,0 @@
#!/bin/bash
# Smoke test for importing the Authentik blueprint once manual setup is done.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences; they are expanded by `echo -e` at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # resets the colour

# Endpoints and credentials. DOMAIN and AUTHENTIK_ADMIN_PASSWORD may be
# supplied through the environment; otherwise the defaults below apply.
DOMAIN="${DOMAIN:-local}"
AUTHENTIK_URL="https://auth.${DOMAIN}"
AUTHENTIK_API_URL="${AUTHENTIK_URL}/api/v3"
ADMIN_EMAIL="admin@local.local"
ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"

echo -e "${BLUE}🧪 Testing Authentik blueprint import...${NC}"
printf '\n'
# Function to check if setup is complete
check_setup_complete() {
    # Succeeds (0) when the initial-setup flow answers HTTP 404, which this
    # script treats as "setup already done"; fails (1) for any other status.
    local authority
    authority=$(sed -E 's#^https?://([^/]+).*$#\1#' <<<"$AUTHENTIK_URL")

    # Send the request for that hostname to 127.0.0.1 instead of using DNS.
    local -a pin=(--resolve "${authority}:443:127.0.0.1")

    local status
    status=$(curl -ks "${pin[@]}" -o /dev/null -w '%{http_code}' "$AUTHENTIK_URL/if/flow/initial-setup/" || true)

    # The test's own exit status is the function's return value.
    [[ "$status" == "404" ]]
}
# Function to get API token via login
get_api_token_via_login() {
    # Log in through the default authentication flow and create an API token.
    #
    # Reads:   AUTHENTIK_URL, AUTHENTIK_API_URL, ADMIN_EMAIL, ADMIN_PASSWORD and,
    #          optionally, AUTHENTIK_TOKEN_EXPIRES (ISO-8601 expiry timestamp;
    #          defaults to the previously hard-coded 2025-12-31T23:59:59Z).
    # Writes:  /tmp/auth_cookies.txt and /tmp/login_response.html.
    # Output:  ONLY the token key on stdout. All progress and error messages go
    #          to stderr, because the caller captures stdout with $(...) and
    #          would otherwise receive the banners as part of the token.
    # Returns: 0 when a token was printed, 1 on any failure.
    echo -e "${YELLOW}🔑 Getting API token via login...${NC}" >&2

    local host
    host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
    # Send requests for the Authentik hostname to 127.0.0.1.
    local resolve=(--resolve "${host}:443:127.0.0.1")
    local flow_url="$AUTHENTIK_URL/if/flow/default-authentication-flow/"
    local csrf_pattern='name="csrfmiddlewaretoken"[^>]*value="[^"]*"'
    local token_expires="${AUTHENTIK_TOKEN_EXPIRES:-2025-12-31T23:59:59Z}"

    # Fetch the login page; this also seeds the cookie jar.
    local login_page
    login_page=$(curl -ks "${resolve[@]}" -c /tmp/auth_cookies.txt "$flow_url" || echo "")
    if [ -z "$login_page" ]; then
        echo -e "${RED}❌ Could not access login page${NC}" >&2
        return 1
    fi

    # Extract the CSRF token embedded in the page.
    local csrf_token
    csrf_token=$(echo "$login_page" | grep -o "$csrf_pattern" | sed 's/.*value="\([^"]*\)".*/\1/' | head -1 || echo "")
    if [ -z "$csrf_token" ]; then
        echo -e "${RED}❌ Could not extract CSRF token${NC}" >&2
        return 1
    fi
    echo -e "${GREEN}✅ CSRF token extracted${NC}" >&2

    # Submit the credentials. Each field is URL-encoded by curl so that
    # characters such as '&', '+' or '%' in the password survive the form post.
    local login_response
    login_response=$(curl -ks "${resolve[@]}" -b /tmp/auth_cookies.txt -c /tmp/auth_cookies.txt \
        -X POST "$flow_url" \
        -H "Content-Type: application/x-www-form-urlencoded" \
        -H "Referer: $flow_url" \
        --data-urlencode "csrfmiddlewaretoken=$csrf_token" \
        --data-urlencode "uid_field=$ADMIN_EMAIL" \
        --data-urlencode "password=$ADMIN_PASSWORD" \
        -w '%{http_code}' -o /tmp/login_response.html || echo "")

    if [[ "$login_response" =~ ^(200|302)$ ]]; then
        echo -e "${GREEN}✅ Login successful${NC}" >&2

        # The admin interface page carries a fresh CSRF token for API calls.
        local admin_page
        admin_page=$(curl -ks "${resolve[@]}" -b /tmp/auth_cookies.txt "$AUTHENTIK_URL/if/admin/" || echo "")
        local admin_csrf
        admin_csrf=$(echo "$admin_page" | grep -o "$csrf_pattern" | sed 's/.*value="\([^"]*\)".*/\1/' | head -1 || echo "")

        if [ -n "$admin_csrf" ]; then
            # The identifier is suffixed with the epoch time so repeated runs
            # do not reuse the same token identifier.
            local token_request
            token_request="{
                \"identifier\": \"blueprint-test-$(date +%s)\",
                \"description\": \"Test token for blueprint import\",
                \"expires\": \"${token_expires}\"
            }"

            local token_response
            token_response=$(curl -ks "${resolve[@]}" -b /tmp/auth_cookies.txt \
                -X POST "$AUTHENTIK_API_URL/core/tokens/" \
                -H "Content-Type: application/json" \
                -H "X-CSRFToken: $admin_csrf" \
                -d "$token_request" 2>/dev/null || echo "")

            if [ -n "$token_response" ]; then
                local token
                token=$(echo "$token_response" | python3 -c "import sys, json; print(json.load(sys.stdin)['key'])" 2>/dev/null || echo "")
                if [ -n "$token" ]; then
                    echo -e "${GREEN}✅ API token created${NC}" >&2
                    echo "$token"
                    return 0
                fi
            fi
        fi
    fi

    echo -e "${RED}❌ Failed to get API token${NC}" >&2
    return 1
}
# Function to import blueprint
import_blueprint() {
    # Register /blueprints/bootstrap.yaml as a blueprint instance and apply it.
    #
    # Args:    $1 - API bearer token.
    # Reads:   AUTHENTIK_URL, AUTHENTIK_API_URL.
    # Returns: 0 once the apply request has been issued, 1 when the creation
    #          response contains no "pk" field.
    local bearer="$1"
    echo -e "${YELLOW}📋 Importing blueprint...${NC}"

    local authority
    authority=$(sed -E 's#^https?://([^/]+).*$#\1#' <<<"$AUTHENTIK_URL")
    # Send requests for the Authentik hostname to 127.0.0.1.
    local -a pin=(--resolve "${authority}:443:127.0.0.1")
    # Python one-liner that re-indents a JSON document read from stdin.
    local pretty="import sys, json; print(json.dumps(json.load(sys.stdin), indent=2))"

    # Step 1: create the blueprint instance.
    local created
    created=$(curl -ks "${pin[@]}" \
        -X POST "$AUTHENTIK_API_URL/managed/blueprints/" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $bearer" \
        -d '{
"name": "AI Tax Agent Bootstrap",
"path": "/blueprints/bootstrap.yaml",
"context": {},
"enabled": true
}' 2>/dev/null || echo "")

    echo -e "${BLUE}Blueprint creation response:${NC}"
    # Fall back to the raw text when the response is not valid JSON.
    echo "$created" | python3 -c "$pretty" 2>/dev/null || echo "$created"

    local pk
    pk=$(echo "$created" | python3 -c "import sys, json; print(json.load(sys.stdin).get('pk', ''))" 2>/dev/null || echo "")
    if [ -z "$pk" ]; then
        echo -e "${RED}❌ Failed to create blueprint${NC}"
        return 1
    fi
    echo -e "${GREEN}✅ Blueprint created with ID: $pk${NC}"

    # Step 2: apply the instance that was just created.
    echo -e "${YELLOW}🔄 Applying blueprint...${NC}"
    local applied
    applied=$(curl -ks "${pin[@]}" \
        -X POST "$AUTHENTIK_API_URL/managed/blueprints/$pk/apply/" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $bearer" \
        -d '{}' 2>/dev/null || echo "")

    echo -e "${BLUE}Blueprint apply response:${NC}"
    echo "$applied" | python3 -c "$pretty" 2>/dev/null || echo "$applied"
    return 0
}
# Main function
main() {
    # Entry point: verify initial setup, obtain an API token, import the
    # blueprint, and always remove the temporary login artefacts afterwards.
    #
    # Returns: 0 on success, 1 when setup is incomplete, the token could not
    #          be obtained, or the blueprint import failed.
    if ! check_setup_complete; then
        echo -e "${YELLOW}⚠️ Initial setup is still required${NC}"
        # Derive the hint from the configured URL/credentials so it always
        # matches what the script actually probes (the old literal pointed at
        # a non-existent "auth.local.lan.lan" host).
        echo -e "${BLUE}📋 Please complete setup at: ${AUTHENTIK_URL}/if/flow/initial-setup/${NC}"
        echo -e "${BLUE}Use credentials: ${ADMIN_EMAIL} / ${ADMIN_PASSWORD}${NC}"
        return 1
    fi
    echo -e "${GREEN}✅ Initial setup is complete${NC}"

    # Track the outcome instead of returning early so the cleanup below runs
    # on failure paths too (the cookie jar holds a live session cookie).
    local status=0
    local api_token
    if api_token=$(get_api_token_via_login); then
        echo -e "${GREEN}🔑 API token obtained${NC}"
        if import_blueprint "$api_token"; then
            echo -e "${GREEN}🎉 Blueprint import test completed!${NC}"
        else
            echo -e "${RED}❌ Blueprint import failed${NC}"
            status=1
        fi
    else
        echo -e "${RED}❌ Could not get API token${NC}"
        status=1
    fi

    # Cleanup
    rm -f /tmp/auth_cookies.txt /tmp/login_response.html
    return "$status"
}
# Run main function
main "$@"

View File

@@ -1,155 +0,0 @@
#!/bin/bash
# Finish the Authentik initial setup and obtain an API token.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences; they are expanded by `echo -e` at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # resets the colour

# Endpoints, credentials and the env file that receives the token.
# DOMAIN and AUTHENTIK_ADMIN_PASSWORD may be supplied through the environment.
DOMAIN="${DOMAIN:-local}"
AUTHENTIK_URL="https://auth.${DOMAIN}"
ADMIN_EMAIL="admin@local"
ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"
ENV_FILE="infra/compose/.env"

echo -e "${BLUE}🔧 Completing Authentik initial setup...${NC}"
printf '\n'
# Function to update env file
update_env_var() {
    # Set NAME=VALUE in $ENV_FILE: rewrite the existing "NAME=" line when one
    # is present, otherwise append a new line.
    #
    # Args:  $1 - variable name
    #        $2 - value, written verbatim (single-line values only)
    # Reads: ENV_FILE, OSTYPE.
    local var_name="$1"
    local var_value="$2"

    if grep -q "^${var_name}=" "$ENV_FILE"; then
        # Escape the characters that are special in a sed replacement: the
        # backslash, '&' (whole match) and the '|' used as the delimiter.
        # Without this, a value containing any of them corrupts the line or
        # makes sed fail.
        local escaped_value
        escaped_value=$(printf '%s' "$var_value" | sed -e 's/[\\&|]/\\&/g')
        local sed_script="s|^${var_name}=.*|${var_name}=${escaped_value}|"

        if [[ "$OSTYPE" == "darwin"* ]]; then
            # macOS sed takes the backup suffix as a separate (here empty) argument.
            sed -i '' "$sed_script" "$ENV_FILE"
        else
            # Linux
            sed -i "$sed_script" "$ENV_FILE"
        fi
        echo -e "${GREEN}✅ Updated ${var_name}${NC}"
    else
        # Add new variable
        echo "${var_name}=${var_value}" >> "$ENV_FILE"
        echo -e "${GREEN}✅ Added ${var_name}${NC}"
    fi
}
# Function to check if setup is complete
check_setup_status() {
    # Probe the initial-setup flow. Returns 0 when it answers HTTP 404 (treated
    # here as "setup complete") and 1 for every other status code.
    local target="$AUTHENTIK_URL/if/flow/initial-setup/"

    local hostname
    hostname=$(sed -E 's#^https?://([^/]+).*$#\1#' <<<"$AUTHENTIK_URL")

    # Send the request for that hostname to 127.0.0.1 instead of using DNS.
    local -a curl_pin=(--resolve "${hostname}:443:127.0.0.1")

    local http_status
    http_status=$(curl -ks "${curl_pin[@]}" -o /dev/null -w '%{http_code}' "$target" || true)

    if [[ "$http_status" != "404" ]]; then
        return 1 # Setup is still needed
    fi
    return 0 # Setup is complete
}
# Function to get API token
get_api_token() {
    # Log in through the default authentication flow and create the bootstrap
    # API token.
    #
    # Reads:   AUTHENTIK_URL, ADMIN_EMAIL, ADMIN_PASSWORD and, optionally,
    #          AUTHENTIK_TOKEN_EXPIRES (ISO-8601 expiry timestamp; defaults to
    #          the previously hard-coded 2025-12-31T23:59:59Z).
    # Writes:  /tmp/authentik_cookies.txt and /tmp/login_response.html.
    # Output:  ONLY the token key on stdout. Progress and error messages go to
    #          stderr: the caller captures stdout with $(...) and writes it to
    #          the env file, so banners on stdout would end up in the token.
    # Returns: 0 when a token was printed, 1 on any failure.
    echo -e "${YELLOW}🔑 Getting API token...${NC}" >&2

    local host
    host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
    # Send requests for the Authentik hostname to 127.0.0.1.
    local resolve=(--resolve "${host}:443:127.0.0.1")
    local flow_url="$AUTHENTIK_URL/if/flow/default-authentication-flow/"
    local token_expires="${AUTHENTIK_TOKEN_EXPIRES:-2025-12-31T23:59:59Z}"

    # Get the CSRF token first (this request also seeds the cookie jar).
    # `head -n 1` keeps a single value even if the page has several matches.
    local csrf_token
    csrf_token=$(curl -ks "${resolve[@]}" -c /tmp/authentik_cookies.txt "$flow_url" \
        | grep -o 'csrfmiddlewaretoken[^>]*value="[^"]*"' \
        | sed 's/.*value="\([^"]*\)".*/\1/' \
        | head -n 1 || echo "")
    if [ -z "$csrf_token" ]; then
        echo -e "${RED}❌ Could not get CSRF token${NC}" >&2
        return 1
    fi

    # Log in to obtain a session. Each field is URL-encoded by curl so that
    # characters such as '&', '+' or '%' in the password survive the form post.
    local login_response
    login_response=$(curl -ks "${resolve[@]}" -b /tmp/authentik_cookies.txt -c /tmp/authentik_cookies.txt \
        -X POST "$flow_url" \
        -H "Content-Type: application/x-www-form-urlencoded" \
        -H "Referer: $flow_url" \
        --data-urlencode "csrfmiddlewaretoken=$csrf_token" \
        --data-urlencode "uid_field=$ADMIN_EMAIL" \
        --data-urlencode "password=$ADMIN_PASSWORD" \
        -w '%{http_code}' -o /tmp/login_response.html || echo "")

    if [[ "$login_response" =~ ^(200|302)$ ]]; then
        echo -e "${GREEN}✅ Login successful${NC}" >&2

        # Create API token
        local token_request
        token_request="{
            \"identifier\": \"ai-tax-agent-bootstrap\",
            \"description\": \"Bootstrap token for AI Tax Agent setup\",
            \"expires\": \"${token_expires}\"
        }"

        local token_response
        token_response=$(curl -ks "${resolve[@]}" -b /tmp/authentik_cookies.txt \
            -X POST "$AUTHENTIK_URL/api/v3/core/tokens/" \
            -H "Content-Type: application/json" \
            -H "X-CSRFToken: $csrf_token" \
            -d "$token_request" 2>/dev/null || echo "")

        if [ -n "$token_response" ]; then
            local token
            token=$(echo "$token_response" | python3 -c "import sys, json; print(json.load(sys.stdin)['key'])" 2>/dev/null || echo "")
            if [ -n "$token" ]; then
                echo -e "${GREEN}✅ API token created${NC}" >&2
                echo "$token"
                return 0
            fi
        fi
    fi

    echo -e "${RED}❌ Failed to get API token${NC}" >&2
    return 1
}
# Main function
main() {
    # Entry point: when initial setup is complete, obtain an API token and
    # store it as AUTHENTIK_BOOTSTRAP_TOKEN in the env file; otherwise print
    # the manual steps. Temporary login artefacts are removed at the end.
    #
    # All URLs shown to the user are derived from AUTHENTIK_URL so they match
    # the host this script actually talks to (the previous literals included a
    # non-existent "auth.local.lan.lan" host).
    if check_setup_status; then
        echo -e "${GREEN}✅ Authentik setup is already complete${NC}"

        # Try to get API token
        local api_token
        if api_token=$(get_api_token); then
            echo -e "${GREEN}🔑 API token obtained${NC}"

            # Update .env file with token
            update_env_var "AUTHENTIK_BOOTSTRAP_TOKEN" "$api_token"

            echo
            echo -e "${GREEN}🎉 Setup complete! You can now run:${NC}"
            echo -e " ${BLUE}make setup-authentik${NC} - to import blueprint configuration"
        else
            echo -e "${YELLOW}⚠️ Could not get API token automatically${NC}"
            echo -e "${BLUE}📋 Manual steps:${NC}"
            echo -e " 1. Open ${BLUE}${AUTHENTIK_URL}${NC} and log in"
            echo -e " 2. Go to Admin Interface > Tokens"
            echo -e " 3. Create a new token and update AUTHENTIK_BOOTSTRAP_TOKEN in .env"
        fi
    else
        echo -e "${YELLOW}📋 Initial setup still required:${NC}"
        echo -e " 1. Open ${BLUE}${AUTHENTIK_URL}/if/flow/initial-setup/${NC}"
        echo -e " 2. Complete the setup wizard with these credentials:"
        echo -e " • Email: ${BLUE}$ADMIN_EMAIL${NC}"
        echo -e " • Password: ${BLUE}$ADMIN_PASSWORD${NC}"
        echo -e " 3. Re-run this script after setup is complete"
    fi

    # Cleanup
    rm -f /tmp/authentik_cookies.txt /tmp/login_response.html
}
# Run main function
main "$@"

View File

@@ -1,125 +0,0 @@
#!/bin/bash
# Drive the Authentik initial-setup flow without user interaction.
set -o errexit -o nounset -o pipefail

# ANSI colour sequences; they are expanded by `echo -e` at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # resets the colour

# Endpoint and credentials. DOMAIN and AUTHENTIK_ADMIN_PASSWORD may be
# supplied through the environment; otherwise the defaults below apply.
DOMAIN="${DOMAIN:-local}"
AUTHENTIK_URL="https://auth.${DOMAIN}"
ADMIN_EMAIL="admin@local.lan"
ADMIN_PASSWORD="${AUTHENTIK_ADMIN_PASSWORD:-admin123}"

echo -e "${BLUE}🤖 Automatically completing Authentik initial setup...${NC}"
printf '\n'
# Function to complete initial setup
complete_initial_setup() {
    # Submit the initial-setup form with the configured admin credentials and
    # verify the result.
    #
    # Reads:   AUTHENTIK_URL, ADMIN_EMAIL, ADMIN_PASSWORD.
    # Writes:  /tmp/authentik_setup_cookies.txt and /tmp/setup_response.html.
    # Returns: 0 when the form post answered 200/302 AND the setup page
    #          afterwards answers 404; 1 otherwise.
    local host
    host=$(echo "$AUTHENTIK_URL" | sed -E 's#^https?://([^/]+).*$#\1#')
    # Send requests for the Authentik hostname to 127.0.0.1.
    local resolve=(--resolve "${host}:443:127.0.0.1")
    local setup_url="$AUTHENTIK_URL/if/flow/initial-setup/"

    echo -e "${YELLOW}📋 Completing initial setup form...${NC}"

    # Get the initial setup page (this also seeds the cookie jar).
    local setup_page
    setup_page=$(curl -ks "${resolve[@]}" -c /tmp/authentik_setup_cookies.txt "$setup_url" || echo "")
    if [ -z "$setup_page" ]; then
        echo -e "${RED}❌ Could not access setup page${NC}"
        return 1
    fi

    # Extract CSRF token
    local csrf_token
    csrf_token=$(echo "$setup_page" | grep -o 'csrfmiddlewaretoken[^>]*value="[^"]*"' | sed 's/.*value="\([^"]*\)".*/\1/' | head -1 || echo "")
    if [ -z "$csrf_token" ]; then
        echo -e "${RED}❌ Could not extract CSRF token${NC}"
        return 1
    fi
    echo -e "${GREEN}✅ CSRF token extracted${NC}"

    # Submit the initial setup form. Each field is URL-encoded by curl so that
    # characters such as '&', '+' or '%' in the password are not misparsed as
    # form syntax.
    local setup_response
    setup_response=$(curl -ks "${resolve[@]}" -b /tmp/authentik_setup_cookies.txt -c /tmp/authentik_setup_cookies.txt \
        -X POST "$setup_url" \
        -H "Content-Type: application/x-www-form-urlencoded" \
        -H "Referer: $setup_url" \
        --data-urlencode "csrfmiddlewaretoken=$csrf_token" \
        --data-urlencode "email=$ADMIN_EMAIL" \
        --data-urlencode "password=$ADMIN_PASSWORD" \
        --data-urlencode "password_repeat=$ADMIN_PASSWORD" \
        -w '%{http_code}' -o /tmp/setup_response.html || echo "")

    if [[ ! "$setup_response" =~ ^(200|302)$ ]]; then
        echo -e "${RED}❌ Setup failed (HTTP $setup_response)${NC}"
        return 1
    fi
    echo -e "${GREEN}✅ Initial setup completed successfully${NC}"

    # Wait a moment for setup to complete
    sleep 3

    # Verify setup is complete by checking if setup page returns 404
    local verify_code
    verify_code=$(curl -ks "${resolve[@]}" -o /dev/null -w '%{http_code}' "$setup_url" || true)
    if [[ "$verify_code" == "404" ]]; then
        echo -e "${GREEN}✅ Setup verification successful${NC}"
        return 0
    fi
    echo -e "${YELLOW}⚠️ Setup may not be complete (verification returned $verify_code)${NC}"
    return 1
}
# Function to check if setup is needed
check_setup_needed() {
    # Returns 0 ("setup needed") when the initial-setup flow answers HTTP 200,
    # and 1 ("not needed") for any other status code.
    #
    # TODO: this is not a valid check if setup is already complete, needs work.
    # Authentik returns 200 even if setup is complete.
    local authority
    authority=$(sed -E 's#^https?://([^/]+).*$#\1#' <<<"$AUTHENTIK_URL")

    # Send the request for that hostname to 127.0.0.1 instead of using DNS.
    local -a pin=(--resolve "${authority}:443:127.0.0.1")

    local status
    status=$(curl -ks "${pin[@]}" -o /dev/null -w '%{http_code}' "$AUTHENTIK_URL/if/flow/initial-setup/" || true)

    # The test's own exit status is the function's return value.
    [[ "$status" == "200" ]]
}
# Main function
main() {
    # Entry point: run the automatic initial setup when the probe says it is
    # needed, print follow-up instructions, and remove the temporary files.
    if check_setup_needed; then
        echo -e "${YELLOW}📋 Initial setup is required${NC}"

        if complete_initial_setup; then
            echo -e "${GREEN}🎉 Authentik initial setup completed automatically!${NC}"
            echo
            echo -e "${BLUE}📋 Next steps:${NC}"
            echo -e " 1. Run ${BLUE}make complete-authentik-setup${NC} to get API token"
            echo -e " 2. Run ${BLUE}make setup-authentik${NC} to import blueprint configuration"
            echo -e " 3. Or run ${BLUE}make setup-sso${NC} to do both automatically"
        else
            echo -e "${RED}❌ Automatic setup failed${NC}"
            echo -e "${YELLOW}📋 Manual setup required:${NC}"
            # Built from AUTHENTIK_URL so the hint matches the host this script
            # talks to (the previous literal pointed at "auth.local.lan.lan").
            echo -e " 1. Open ${BLUE}${AUTHENTIK_URL}/if/flow/initial-setup/${NC}"
            echo -e " 2. Use credentials: ${BLUE}$ADMIN_EMAIL${NC} / ${BLUE}$ADMIN_PASSWORD${NC}"
        fi
    else
        echo -e "${GREEN}✅ Authentik setup is already complete${NC}"
    fi

    # Cleanup
    rm -f /tmp/authentik_setup_cookies.txt /tmp/setup_response.html
}
# Run main function
main "$@"

View File

@@ -38,14 +38,29 @@ async def test_backend_journey():
try: try:
# 2. Upload a document # 2. Upload a document
async with httpx.AsyncClient() as client: async with httpx.AsyncClient(
verify=False
) as client: # Disable SSL verification for local testing
# Create a dummy PDF file # Create a dummy PDF file
files = {"file": ("test.pdf", b"%PDF-1.4 mock content", "application/pdf")} # Create a valid minimal PDF file
pdf_content = (
b"%PDF-1.0\n1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj 2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj "
b"3 0 obj<</Type/Page/MediaBox[0 0 3 3]/Parent 2 0 R/Resources<<>>>>endobj\nxref\n0 4\n0000000000 65535 f\n"
b"0000000010 00000 n\n0000000060 00000 n\n0000000111 00000 n\ntrailer<</Size 4/Root 1 0 R>>\nstartxref\n190\n%%EOF"
)
files = {"file": ("test.pdf", pdf_content, "application/pdf")}
response = await client.post( response = await client.post(
f"{INGESTION_URL}/upload", f"{INGESTION_URL}/upload",
files=files, files=files,
data={"kind": "invoice", "source": "e2e_test"}, data={"kind": "invoice", "source": "e2e_test"},
headers={"X-Tenant-ID": TENANT_ID, "X-User-ID": "e2e_tester"}, headers={
"X-Tenant-ID": TENANT_ID,
"X-User-ID": "e2e_tester",
# Required by TrustedProxyMiddleware
"X-Authenticated-User": "e2e_tester",
"X-Authenticated-Email": "e2e@example.com",
"Authorization": "Bearer mock-token",
},
) )
assert response.status_code == 200, f"Upload failed: {response.text}" assert response.status_code == 200, f"Upload failed: {response.text}"
upload_data = response.json() upload_data = response.json()