From eea46ac89cf04625bfacba8cd9d937d8e0e1de7b Mon Sep 17 00:00:00 2001 From: harkon Date: Tue, 14 Oct 2025 07:42:31 +0100 Subject: [PATCH] deployment, linting and infra configuration --- apps/svc_coverage/models.py | 2 +- apps/svc_coverage/requirements.txt | 13 + apps/svc_extract/requirements.txt | 12 +- apps/svc_firm_connectors/requirements.txt | 36 +- apps/svc_forms/requirements.txt | 26 +- apps/svc_hmrc/requirements.txt | 32 +- apps/svc_kg/requirements.txt | 14 +- apps/svc_normalize_map/requirements.txt | 34 +- apps/svc_ocr/main.py | 357 +++++++---- apps/svc_ocr/requirements.txt | 2 +- apps/svc_rag_indexer/requirements.txt | 2 +- apps/svc_reason/requirements.txt | 20 +- apps/svc_rpa/requirements.txt | 10 +- docs/DEPLOYMENT_PLAN.md | 27 +- docs/DEPLOYMENT_PROGRESS.md | 9 +- docs/DEPLOYMENT_STATUS.md | 21 +- docs/ENVIRONMENT_COMPARISON.md | 7 +- docs/GITEA_REGISTRY_DEBUG.md | 332 ----------- docs/GITEA_REGISTRY_FIX.md | 194 ------ docs/OPTIMIZATION_SUMMARY.md | 4 +- docs/QUICK_START.md | 38 +- libs/config.py => docs/SRE.md | 0 .../authentik/bootstrap.yaml | 0 infra/{configs => base}/loki/loki-config.yml | 0 .../promtail/promtail-config.yml | 0 infra/base/traefik/config/traefik-dynamic.yml | 46 ++ infra/base/traefik/config/traefik.yml | 4 +- infra/configs/traefik/app-middlewares.yml | 31 - infra/configs/traefik/certs/local.crt | 25 - infra/configs/traefik/certs/local.key | 28 - libs/neo/client.py | 6 +- .../certs/acme.json => libs/ocr/__init__.py | 0 libs/ocr/processor.py | 507 ++++++++++++++++ libs/requirements-base.txt | 8 +- libs/requirements-pdf.txt | 1 + libs/storage/client.py | 4 +- mypy.ini | 4 + pyproject.toml | 11 +- requirements.txt | 5 + scripts/deploy-to-production.sh | 36 +- tests/e2e/test_happy_path.py | 557 +----------------- 41 files changed, 1017 insertions(+), 1448 deletions(-) create mode 100644 apps/svc_coverage/requirements.txt delete mode 100644 docs/GITEA_REGISTRY_DEBUG.md delete mode 100644 docs/GITEA_REGISTRY_FIX.md rename libs/config.py => docs/SRE.md (100%) rename infra/{configs => base}/authentik/bootstrap.yaml (100%) rename infra/{configs => base}/loki/loki-config.yml (100%) rename infra/{configs => base}/promtail/promtail-config.yml (100%) delete mode 100644 infra/configs/traefik/app-middlewares.yml delete mode 100644 infra/configs/traefik/certs/local.crt delete mode 100644 infra/configs/traefik/certs/local.key rename infra/configs/traefik/certs/acme.json => libs/ocr/__init__.py (100%) create mode 100644 libs/ocr/processor.py diff --git a/apps/svc_coverage/models.py b/apps/svc_coverage/models.py index 2cd96b8..d0c04c2 100644 --- a/apps/svc_coverage/models.py +++ b/apps/svc_coverage/models.py @@ -1,6 +1,6 @@ """Database models for coverage service.""" -# FILE: apps/svc-coverage/models.py +# FILE: apps/svc_coverage/models.py from datetime import datetime diff --git a/apps/svc_coverage/requirements.txt b/apps/svc_coverage/requirements.txt new file mode 100644 index 0000000..4eff96b --- /dev/null +++ b/apps/svc_coverage/requirements.txt @@ -0,0 +1,13 @@ +# Service-specific dependencies for svc_coverage + +# Database migrations +alembic>=1.14.0 + +# OpenTelemetry (required by libs.observability) +opentelemetry-api>=1.37.0 +opentelemetry-sdk>=1.37.0 +opentelemetry-exporter-otlp-proto-grpc>=1.37.0 +opentelemetry-instrumentation-fastapi>=0.42b0 +opentelemetry-instrumentation-httpx>=0.42b0 +opentelemetry-instrumentation-psycopg2>=0.42b0 +opentelemetry-instrumentation-redis>=0.42b0 diff --git a/apps/svc_extract/requirements.txt b/apps/svc_extract/requirements.txt index 5e873fa..7b48d9b 100644 --- a/apps/svc_extract/requirements.txt +++ b/apps/svc_extract/requirements.txt @@ -1,17 +1,17 @@ # Service-specific dependencies for svc_extract # LLM integration -openai>=1.3.0 -anthropic>=0.7.0 +openai>=2.3.0 +anthropic>=0.69.0 # JSON schema validation -jsonschema>=4.20.0 +jsonschema>=4.25.1 # Template processing -jinja2>=3.1.0 +jinja2>=3.1.6 # Text similarity (lightweight) fuzzywuzzy>=0.18.0 -python-Levenshtein>=0.23.0 +python-Levenshtein>=0.27.1 # Data validation -cerberus>=1.3.4 +cerberus>=1.3.7 diff --git a/apps/svc_firm_connectors/requirements.txt b/apps/svc_firm_connectors/requirements.txt index cf274cf..48879c3 100644 --- a/apps/svc_firm_connectors/requirements.txt +++ b/apps/svc_firm_connectors/requirements.txt @@ -1,45 +1,45 @@ # FastAPI and server -fastapi>=0.104.1 -uvicorn[standard]>=0.24.0 +fastapi>=0.118.3 +uvicorn[standard]>=0.37.0 pydantic>=2.5.0 # Service-specific dependencies # Database connectors -sqlalchemy>=2.0.0 -pymssql>=2.2.0 +sqlalchemy>=2.0.44 +pymssql>=2.3.7 cx-Oracle>=8.3.0 # API clients for practice management systems -zeep>=4.2.0 # SOAP client -xmltodict>=0.13.0 +zeep>=4.3.2 # SOAP client +xmltodict>=1.0.2 # OAuth for various systems -authlib>=1.2.0 -requests-oauthlib>=1.3.0 +authlib>=1.6.5 +requests-oauthlib>=2.0.0 # Data synchronization -pandas>=2.1.0 +pandas>=2.3.3 # Rate limiting -ratelimit>=2.2.0 +ratelimit>=2.2.1 # Retry mechanisms -tenacity>=8.2.0 +tenacity>=9.1.2 # CSV processing -csvkit>=1.1.0 +csvkit>=2.1.0 # Excel file processing -openpyxl>=3.1.0 -xlrd>=2.0.0 +openpyxl>=3.1.5 +xlrd>=2.0.2 # Data validation -marshmallow>=3.20.0 -cerberus>=1.3.4 +marshmallow>=4.0.1 +cerberus>=1.3.7 # Connection pooling (built into SQLAlchemy) # sqlalchemy-pool>=1.3.0 # Package doesn't exist, pooling is built into SQLAlchemy # Additional utilities -python-dateutil>=2.8.0 -pytz>=2023.3 +python-dateutil>=2.9.0 +pytz>=2025.2 diff --git a/apps/svc_forms/requirements.txt b/apps/svc_forms/requirements.txt index 05028ec..2348e69 100644 --- a/apps/svc_forms/requirements.txt +++ b/apps/svc_forms/requirements.txt @@ -1,37 +1,37 @@ # FastAPI and server -fastapi>=0.104.1 -uvicorn[standard]>=0.24.0 -pydantic>=2.5.0 +fastapi>=0.118.3 +uvicorn[standard]>=0.37.0 +pydantic>=2.12.0 # Service-specific dependencies # PDF form filling pdfrw>=0.4 -reportlab>=4.0.0 +reportlab>=4.4.4 # PDF processing -PyPDF2>=3.0.0 -pypdf>=3.17.0 +PyPDF2>=3.0.1 +pypdf>=6.1.1 # Image processing for overlays -Pillow>=10.1.0 +Pillow>=11.3.0 # ZIP file creation for evidence packs zipfile36>=0.1.3 # Template processing -jinja2>=3.1.0 +jinja2>=3.1.6 # QR code generation -qrcode>=7.4.0 +qrcode>=8.2 # Barcode generation -python-barcode>=0.15.0 +python-barcode>=0.16.1 # Font handling -fonttools>=4.44.0 +fonttools>=4.60.1 # Additional PDF utilities -pdfminer.six>=20231228 +pdfminer.six>=20250506 # Document conversion -python-docx>=1.1.0 +python-docx>=1.2.0 diff --git a/apps/svc_hmrc/requirements.txt b/apps/svc_hmrc/requirements.txt index 174cd91..273ea8f 100644 --- a/apps/svc_hmrc/requirements.txt +++ b/apps/svc_hmrc/requirements.txt @@ -1,40 +1,40 @@ # FastAPI and server -fastapi>=0.104.1 -uvicorn[standard]>=0.24.0 -pydantic>=2.5.0 +fastapi>=0.118.3 +uvicorn[standard]>=0.37.0 +pydantic>=2.12.0 # Service-specific dependencies # OAuth and authentication -authlib>=1.2.0 -oauthlib>=3.2.0 +authlib>=1.6.5 +oauthlib>=3.3.1 # HTTP client with OAuth support -requests-oauthlib>=1.3.0 +requests-oauthlib>=2.0.0 # XML processing for HMRC APIs -lxml>=4.9.0 -xmltodict>=0.13.0 +lxml>=6.0.2 +xmltodict>=1.0.2 # JSON Web Tokens -pyjwt>=2.8.0 +pyjwt>=2.10.1 # UK government API utilities -govuk-frontend-jinja>=2.8.0 +govuk-frontend-jinja>=3.8.0 # Date and time for tax years -python-dateutil>=2.8.0 +python-dateutil>=2.9.0 # Retry mechanisms -tenacity>=8.2.0 +tenacity>=9.1.2 # Rate limiting -ratelimit>=2.2.0 +ratelimit>=2.2.1 # API validation -marshmallow>=3.20.0 +marshmallow>=4.0.1 # Encryption for sensitive data -cryptography>=41.0.0 +cryptography>=46.0.2 # Additional HTTP utilities -urllib3>=2.1.0 +urllib3>=2.5.0 diff --git a/apps/svc_kg/requirements.txt b/apps/svc_kg/requirements.txt index f743624..b9bc67f 100644 --- a/apps/svc_kg/requirements.txt +++ b/apps/svc_kg/requirements.txt @@ -1,22 +1,22 @@ # Service-specific dependencies # RDF and semantic web -rdflib>=7.0.0 -pyshacl>=0.25.0 +rdflib>=7.2.1 +pyshacl>=0.30.1 # Graph algorithms -networkx>=3.2.0 +networkx>=3.5 # Data export formats -xmltodict>=0.13.0 +xmltodict>=1.0.2 # Query optimization -pyparsing>=3.1.0 +pyparsing>=3.2.5 # Graph visualization (optional) -graphviz>=0.20.0 +graphviz>=0.21 # Additional Neo4j utilities -neomodel>=5.2.0 +neomodel>=5.5.3 # Cypher query building py2neo>=2021.2.4 diff --git a/apps/svc_normalize_map/requirements.txt b/apps/svc_normalize_map/requirements.txt index 1ca878a..bd26322 100644 --- a/apps/svc_normalize_map/requirements.txt +++ b/apps/svc_normalize_map/requirements.txt @@ -1,37 +1,37 @@ # FastAPI and server -fastapi>=0.104.1 -uvicorn[standard]>=0.24.0 -pydantic>=2.5.0 +fastapi>=0.118.3 +uvicorn[standard]>=0.37.0 +pydantic>=2.12.0 # Service-specific dependencies # Data normalization and cleaning -pandas>=2.1.0 -numpy>=1.24.0 +pandas>=2.3.3 +numpy>=2.3.3 # Currency and exchange rates -forex-python>=1.8 -babel>=2.13.0 +forex-python>=1.9.2 +babel>=2.17.0 # Date and time processing -python-dateutil>=2.8.0 -pytz>=2023.3 +python-dateutil>=2.9.0 +pytz>=2025.2 # Text normalization -unidecode>=1.3.0 -phonenumbers>=8.13.0 +unidecode>=1.4.0 +phonenumbers>=9.0.16 # Entity resolution and matching recordlinkage>=0.16.0 fuzzywuzzy>=0.18.0 -python-Levenshtein>=0.23.0 +python-Levenshtein>=0.27.1 # Geographic data -geopy>=2.4.0 -pycountry>=23.12.0 +geopy>=2.4.1 +pycountry>=24.6.1 # Data validation -cerberus>=1.3.4 -marshmallow>=3.20.0 +cerberus>=1.3.7 +marshmallow>=4.0.1 # UK-specific utilities -uk-postcode-utils>=1.0.0 +uk-postcode-utils>=1.1 diff --git a/apps/svc_ocr/main.py b/apps/svc_ocr/main.py index ae5b2cd..1377173 100644 --- a/apps/svc_ocr/main.py +++ b/apps/svc_ocr/main.py @@ -1,17 +1,23 @@ # FILE: apps/svc-ocr/main.py # OCR and layout extraction using Tesseract, LayoutLM, and document AI +import asyncio +import io import os # Import shared libraries import sys from datetime import datetime -from typing import Any +from typing import Any, cast +import pytesseract import structlog import ulid from fastapi import BackgroundTasks, Depends, HTTPException, Request from fastapi.responses import JSONResponse +from pdf2image import convert_from_bytes +from PIL import Image +from PyPDF2 import PdfReader sys.path.append(os.path.join(os.path.dirname(__file__), "..", "..")) @@ -19,6 +25,7 @@ from libs.app_factory import create_app from libs.config import BaseAppSettings, create_event_bus, create_minio_client from libs.events import EventBus, EventPayload, EventTopics from libs.observability import get_metrics, get_tracer, setup_observability +from libs.ocr.processor import OCRProcessor from libs.schemas import ErrorResponse from libs.security import get_current_user, get_tenant_id from libs.storage import DocumentStorage, StorageClient @@ -48,28 +55,31 @@ class OCRSettings(BaseAppSettings): include_coordinates: bool = True include_confidence: bool = True + # Vision/LLM OCR configuration + vision_provider: str = "ollama" # or "openai" + vision_model: str = "llama3.2-vision:11b" + vision_format: str = ( + "text" # text | markdown | json | table | key_value | structured + ) + vision_preprocess: bool = True + openai_base_url: str = "https://api.openai.com/v1/chat/completions" -# Create app and settings -app, settings = create_app( - service_name="svc-ocr", - title="Tax Agent OCR Service", - description="OCR and layout extraction service", - settings_class=OCRSettings, -) # fmt: skip # Global clients storage_client: StorageClient | None = None document_storage: DocumentStorage | None = None event_bus: EventBus | None = None -tracer = get_tracer("svc-ocr") -metrics = get_metrics() + +vision_processor: OCRProcessor | None = None +# Settings will be initialized after app creation +settings: OCRSettings -@app.on_event("startup") -async def startup_event() -> None: +async def init_dependencies(app_settings: OCRSettings) -> None: """Initialize service dependencies""" - global storage_client, document_storage, event_bus + global storage_client, document_storage, event_bus, settings, vision_processor + settings = app_settings logger.info("Starting OCR service") # Setup observability @@ -79,42 +89,44 @@ async def startup_event() -> None: minio_client = create_minio_client(settings) storage_client = StorageClient(minio_client) document_storage = DocumentStorage(storage_client) - # Initialize event bus event_bus = create_event_bus(settings) if not event_bus: raise HTTPException(status_code=500, detail="Event bus not initialized") - await event_bus.start() + eb = event_bus + # mypy: event_bus is Optional, so use local alias after check + await eb.start() # Subscribe to document ingestion events - await event_bus.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested) + await eb.subscribe(EventTopics.DOC_INGESTED, _handle_document_ingested) + + # Initialize shared OCRProcessor for vision strategy + try: + vision_processor = OCRProcessor( + model_name=settings.vision_model, + provider=settings.vision_provider, + openai_base_url=settings.openai_base_url, + ) + except Exception as e: + logger.error("Failed to initialize vision OCR processor", error=str(e)) logger.info("OCR service started successfully") -@app.on_event("shutdown") -async def shutdown_event() -> None: - """Cleanup service dependencies""" - global event_bus +# Create app and settings +app, _settings = create_app( + service_name="svc-ocr", + title="Tax Agent OCR Service", + description="OCR and layout extraction service", + settings_class=OCRSettings, +) # fmt: skip - logger.info("Shutting down OCR service") +# Initialize dependencies immediately +asyncio.run(init_dependencies(cast(OCRSettings, _settings))) - if event_bus: - await event_bus.stop() - - logger.info("OCR service shutdown complete") - - -@app.get("/health") -async def health_check() -> dict[str, Any]: - """Health check endpoint""" - return { - "status": "healthy", - "service": settings.service_name, - "version": settings.service_version, - "timestamp": datetime.utcnow().isoformat(), - } +tracer = get_tracer("svc-ocr") +metrics = get_metrics() @app.post("/process/{doc_id}") @@ -132,9 +144,14 @@ async def process_document( span.set_attribute("tenant_id", tenant_id) span.set_attribute("strategy", strategy) + ds = document_storage + if ds is None: + raise HTTPException( + status_code=500, detail="Document storage not initialized" + ) try: # Check if document exists - doc_content = await document_storage.get_document(tenant_id, doc_id) + doc_content = await ds.get_document(tenant_id, doc_id) if not doc_content: raise HTTPException(status_code=404, detail="Document not found") @@ -142,9 +159,9 @@ async def process_document( processing_id = str(ulid.new()) span.set_attribute("processing_id", processing_id) - # Start background processing + # Start background processing via sync wrapper (for mypy correctness) background_tasks.add_task( - _process_document_async, + _schedule_process_document_async, doc_id, tenant_id, doc_content, @@ -168,7 +185,9 @@ async def process_document( raise except Exception as e: logger.error("Failed to start OCR processing", doc_id=doc_id, error=str(e)) - raise HTTPException(status_code=500, detail="Failed to start processing") + raise HTTPException( + status_code=500, detail="Failed to start processing" + ) from e @app.get("/results/{doc_id}") @@ -183,9 +202,14 @@ async def get_ocr_results( span.set_attribute("doc_id", doc_id) span.set_attribute("tenant_id", tenant_id) + ds = document_storage + if ds is None: + raise HTTPException( + status_code=500, detail="Document storage not initialized" + ) try: # Get OCR results from storage - ocr_results = await document_storage.get_ocr_result(tenant_id, doc_id) + ocr_results = await ds.get_ocr_result(tenant_id, doc_id) if not ocr_results: raise HTTPException(status_code=404, detail="OCR results not found") @@ -196,26 +220,32 @@ async def get_ocr_results( raise except Exception as e: logger.error("Failed to get OCR results", doc_id=doc_id, error=str(e)) - raise HTTPException(status_code=500, detail="Failed to get OCR results") + raise HTTPException( + status_code=500, detail="Failed to get OCR results" + ) from e async def _handle_document_ingested(topic: str, payload: EventPayload) -> None: """Handle document ingestion events""" - try: - data = payload.data - doc_id = data.get("doc_id") - tenant_id = data.get("tenant_id") + data = payload.data + doc_id = data.get("doc_id") + tenant_id = data.get("tenant_id") - if not doc_id or not tenant_id: - logger.warning("Invalid document ingestion event", data=data) - return + if not doc_id or not tenant_id: + logger.warning("Invalid document ingestion event", data=data) + return + ds = document_storage + if ds is None: + logger.error("Document storage not initialized") + return - # Auto-process PDF documents - if data.get("content_type") == "application/pdf": - logger.info("Auto-processing ingested document", doc_id=doc_id) + # Auto-process PDF documents + if data.get("content_type") == "application/pdf": + logger.info("Auto-processing ingested document", doc_id=doc_id) + try: # Get document content - doc_content = await document_storage.get_document(tenant_id, doc_id) + doc_content = await ds.get_document(tenant_id, doc_id) if doc_content: await _process_document_async( doc_id=doc_id, @@ -225,9 +255,10 @@ async def _handle_document_ingested(topic: str, payload: EventPayload) -> None: processing_id=str(ulid.new()), actor=payload.actor, ) - - except Exception as e: - logger.error("Failed to handle document ingestion", error=str(e)) + except Exception as e: + logger.error( + "Failed to handle document ingestion", doc_id=doc_id, error=str(e) + ) async def _process_document_async( @@ -250,8 +281,8 @@ async def _process_document_async( images = await _pdf_to_images(content) # Process each page - pages_data: list[Any] = [] - for page_num, image in enumerate(images, 1): + pages_data: list[dict[str, Any]] = [] + for page_num, image in enumerate(images, 0): page_data = await _process_page(image, page_num, strategy) pages_data.append(page_data) @@ -270,7 +301,10 @@ async def _process_document_async( } # Store results - await document_storage.store_ocr_result(tenant_id, doc_id, ocr_results) + ds = document_storage + if ds is None: + raise RuntimeError("Document storage not initialized") + await ds.store_ocr_result(tenant_id, doc_id, ocr_results) # Update metrics metrics.counter("documents_processed_total").labels( @@ -282,7 +316,7 @@ async def _process_document_async( ).observe( datetime.utcnow().timestamp() - datetime.fromisoformat( - ocr_results["processed_at"].replace("Z", "") + ocr_results["processed_at"].replace("Z", "") # type: ignore ).timestamp() ) @@ -300,7 +334,9 @@ async def _process_document_async( tenant_id=tenant_id, ) - await event_bus.publish(EventTopics.DOC_OCR_READY, event_payload) + eb = event_bus + if eb is not None: + await eb.publish(EventTopics.DOC_OCR_READY, event_payload) logger.info( "OCR processing completed", doc_id=doc_id, pages=len(pages_data) @@ -316,58 +352,91 @@ async def _process_document_async( async def _pdf_to_images(pdf_content: bytes) -> list[bytes]: - """Convert PDF to images""" + """Convert PDF to page images without PyMuPDF. + + Primary: pdf2image (requires poppler). Fallback: extract largest embedded image per page via PyPDF2/Pillow. + """ + # First try pdf2image for full-page rasterization try: - import fitz # PyMuPDF - - # Open PDF - pdf_doc = fitz.open(stream=pdf_content, filetype="pdf") - - images: list[Any] = [] - for page_num in range(min(len(pdf_doc), settings.max_pages)): - page = pdf_doc[page_num] - - # Render page to image - mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better OCR - pix = page.get_pixmap(matrix=mat) - img_data = pix.tobytes("png") - - images.append(img_data) - - pdf_doc.close() - return images - - except ImportError: - logger.error("PyMuPDF not available, using fallback") - return await _pdf_to_images_fallback(pdf_content) - except Exception as e: - logger.error("PDF conversion failed", error=str(e)) - raise - - -async def _pdf_to_images_fallback(pdf_content: bytes) -> list[bytes]: - """Fallback PDF to images conversion""" - try: - from pdf2image import convert_from_bytes - images = convert_from_bytes( pdf_content, dpi=200, first_page=1, last_page=settings.max_pages ) - - # Convert PIL images to bytes - image_bytes: list[Any] = [] + image_bytes: list[bytes] = [] for img in images: - import io - img_buffer = io.BytesIO() img.save(img_buffer, format="PNG") image_bytes.append(img_buffer.getvalue()) - return image_bytes + except Exception as e: + logger.warning( + "pdf2image conversion failed; falling back to PyPDF2", error=str(e) + ) - except ImportError: - logger.error("pdf2image not available") - raise Exception("No PDF conversion library available") + # Fallback: extract largest embedded image per page using PyPDF2 + try: + reader = PdfReader(io.BytesIO(pdf_content)) + out_images: list[bytes] = [] + for page_index, page in enumerate(reader.pages): + if page_index >= settings.max_pages: + break + try: + resources = page.get("/Resources") + if resources is None: + continue + xobject = resources.get("/XObject") + if xobject is None: + continue + xobject = xobject.get_object() + + largest = None + largest_area = -1 + for _, obj_ref in xobject.items(): + try: + obj = obj_ref.get_object() + if obj.get("/Subtype") != "/Image": + continue + width = int(obj.get("/Width", 0)) + height = int(obj.get("/Height", 0)) + area = width * height + if area > largest_area: + largest = obj + largest_area = area + except Exception: + continue + + if largest is None: + continue + + data = largest.get_data() + filt = largest.get("/Filter") + + if filt in ("/DCTDecode", "/JPXDecode"): + # JPEG or JPEG2000 + out_images.append(data) + else: + # Flate or other; decode via Pillow + mode = "RGB" + colorspace = largest.get("/ColorSpace") + if colorspace in ("/DeviceGray",): + mode = "L" + width = int(largest.get("/Width", 0)) + height = int(largest.get("/Height", 0)) + try: + img = Image.frombytes(mode, (width, height), data) + except Exception: + img = Image.open(io.BytesIO(data)) + buf = io.BytesIO() + img.save(buf, format="PNG") + out_images.append(buf.getvalue()) + except Exception: + continue + + if not out_images: + raise RuntimeError("No images extracted via PyPDF2 fallback") + return out_images + except Exception as fallback_e: + logger.error("PDF conversion failed (both methods)", error=str(fallback_e)) + raise async def _process_page( @@ -395,6 +464,8 @@ async def _process_page( layoutlm_result.get("confidence", 0), ), } + elif strategy == "vision": + return await _process_with_vision(image_data, page_num) else: raise ValueError(f"Unknown strategy: {strategy}") @@ -402,11 +473,6 @@ async def _process_page( async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, Any]: """Process page with Tesseract OCR""" try: - import io - - import pytesseract - from PIL import Image - # Load image image = Image.open(io.BytesIO(image_data)) @@ -414,13 +480,13 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, config = f"{settings.tesseract_config} -l {settings.languages}" # Extract text with confidence - data = pytesseract.image_to_data( + data = pytesseract.image_to_data( # type: ignore image, config=config, output_type=pytesseract.Output.DICT ) # Process results - words: list[Any] = [] - confidences: list[Any] = [] + words: list[dict[str, Any]] = [] + confidences: list[float] = [] for i in range(len(data["text"])): if int(data["conf"][i]) > 0: # Valid confidence @@ -449,13 +515,6 @@ async def _process_with_tesseract(image_data: bytes, page_num: int) -> dict[str, "word_count": len(words), } - except ImportError: - logger.error("pytesseract not available") - return { - "page": page_num, - "strategy": "tesseract", - "error": "pytesseract not available", - } except Exception as e: logger.error("Tesseract processing failed", page=page_num, error=str(e)) return {"page": page_num, "strategy": "tesseract", "error": str(e)} @@ -482,6 +541,68 @@ async def _process_with_layoutlm(image_data: bytes, page_num: int) -> dict[str, return {"page": page_num, "strategy": "layoutlm", "error": str(e)} +async def _process_with_vision(image_data: bytes, page_num: int) -> dict[str, Any]: + """Process page with LLM vision OCR via shared OCRProcessor""" + try: + vp = vision_processor + if vp is None: + raise RuntimeError("Vision OCR processor not initialized") + + # Persist the page image temporarily for the processor API + import tempfile + + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + tmp.write(image_data) + tmp_path = tmp.name + + try: + text = vp.process_image( + image_path=tmp_path, + format_type=settings.vision_format, + preprocess=settings.vision_preprocess, + language=settings.languages, + ) + finally: + try: + os.remove(tmp_path) + except OSError: + pass + + return { + "page": page_num, + "strategy": "vision", + "text": text if isinstance(text, str) else str(text), + "confidence": 0.0, # Not provided by LLM API + } + except Exception as e: + logger.error("Vision processing failed", page=page_num, error=str(e)) + return {"page": page_num, "strategy": "vision", "error": str(e)} + + +def _schedule_process_document_async( + doc_id: str, + tenant_id: str, + content: bytes, + strategy: str, + processing_id: str, + actor: str, +) -> None: + """Sync wrapper to schedule the async OCR task. + + This keeps FastAPI BackgroundTasks type expectations satisfied under mypy strict. + """ + asyncio.create_task( + _process_document_async( + doc_id=doc_id, + tenant_id=tenant_id, + content=content, + strategy=strategy, + processing_id=processing_id, + actor=actor, + ) + ) + + @app.exception_handler(HTTPException) async def http_exception_handler(request: Request, exc: HTTPException) -> JSONResponse: """Handle HTTP exceptions with RFC7807 format""" diff --git a/apps/svc_ocr/requirements.txt b/apps/svc_ocr/requirements.txt index 47800cf..1777a11 100644 --- a/apps/svc_ocr/requirements.txt +++ b/apps/svc_ocr/requirements.txt @@ -5,7 +5,7 @@ pytesseract>=0.3.13 # PDF processing -PyMuPDF>=1.26.4 +PyPDF2>=3.0.1 pdf2image>=1.17.0 # Image processing diff --git a/apps/svc_rag_indexer/requirements.txt b/apps/svc_rag_indexer/requirements.txt index 06be863..5a8732f 100644 --- a/apps/svc_rag_indexer/requirements.txt +++ b/apps/svc_rag_indexer/requirements.txt @@ -2,7 +2,7 @@ # NOTE: ML dependencies (sentence-transformers, transformers, torch, numpy) are in base-ml image # Text chunking (lightweight alternative to langchain) -tiktoken>=0.11.0 +tiktoken>=0.12.0 # Text preprocessing (lightweight) beautifulsoup4>=4.14.2 diff --git a/apps/svc_reason/requirements.txt b/apps/svc_reason/requirements.txt index 7bcc998..ce6c4a2 100644 --- a/apps/svc_reason/requirements.txt +++ b/apps/svc_reason/requirements.txt @@ -1,20 +1,20 @@ # FastAPI and server -fastapi>=0.104.1 -uvicorn[standard]>=0.24.0 -pydantic>=2.5.0 +fastapi>=0.118.3 +uvicorn[standard]>=0.37.0 +pydantic>=2.12.0 # Service-specific dependencies # Mathematical calculations # decimal is part of Python standard library -sympy>=1.12.0 +sympy>=1.14.0 # Tax calculations numpy>=2.3.3 -pandas>=2.1.0 +pandas>=2.3.3 # Date and time calculations -python-dateutil>=2.8.0 -pytz>=2023.3 +python-dateutil>=2.9.0 +pytz>=2025.2 # UK tax specific # uk-tax-calculator>=1.0.0 # Package may not exist, commenting out @@ -26,10 +26,10 @@ pytz>=2023.3 # quantlib>=1.32.0 # Package may not exist, commenting out # Data validation -cerberus>=1.3.4 +cerberus>=1.3.7 # Template processing for explanations -jinja2>=3.1.0 +jinja2>=3.1.6 # Statistical calculations -scipy>=1.11.0 +scipy>=1.16.2 diff --git a/apps/svc_rpa/requirements.txt b/apps/svc_rpa/requirements.txt index b090cee..c882601 100644 --- a/apps/svc_rpa/requirements.txt +++ b/apps/svc_rpa/requirements.txt @@ -1,11 +1,11 @@ # FastAPI and server -fastapi>=0.104.1 -uvicorn[standard]>=0.24.0 -pydantic>=2.5.0 +fastapi>=0.118.3 +uvicorn[standard]>=0.37.0 +pydantic>=2.12.0 # Service-specific dependencies # Browser automation -playwright>=1.40.0 +playwright>=1.55.0 # Additional async utilities # asyncio-timeout>=4.0.3 # Deprecated, use asyncio.timeout from Python 3.11+ standard library @@ -14,4 +14,4 @@ playwright>=1.40.0 aioredis>=2.0.1 # Browser management -psutil>=5.9.0 +psutil>=7.1.0 diff --git a/docs/DEPLOYMENT_PLAN.md b/docs/DEPLOYMENT_PLAN.md index a72078d..95f66b5 100644 --- a/docs/DEPLOYMENT_PLAN.md +++ b/docs/DEPLOYMENT_PLAN.md @@ -7,6 +7,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an ## Current State Analysis ### Remote Server (`141.136.35.199`) + - **Location**: `/opt/compose/` - **Existing Services**: - Traefik v3.5.1 (reverse proxy with GoDaddy DNS challenge) @@ -25,6 +26,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an - `portainer.harkon.co.uk` ### Local Repository (`infra/compose/`) + - **Compose Files**: - `docker-compose.local.yml` - Full stack for local development - `docker-compose.backend.yml` - Backend services (appears to be production-ready) @@ -39,25 +41,30 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an ## Challenges & Conflicts ### 1. **Duplicate Services** + - Both environments have Traefik and Authentik - Need to decide: shared vs. isolated ### 2. **Network Naming** + - Remote: `frontend`, `backend` - Local: `ai-tax-agent-frontend`, `ai-tax-agent-backend` - Production needs: Consistent naming ### 3. **Domain Management** + - Remote: `*.harkon.co.uk` (public) - Local: `*.local.lan` (development) - Production: Need subdomains like `app.harkon.co.uk`, `api.harkon.co.uk` ### 4. **SSL Certificates** + - Remote: GoDaddy DNS challenge (production) - Local: Self-signed certificates - Production: Must use GoDaddy DNS challenge ### 5. **Resource Isolation** + - Company services need to remain stable - Application services need independent deployment/rollback @@ -66,6 +73,7 @@ This plan outlines the strategy to host both the **AI Tax Agent application** an We will deploy the company services and the AI Tax Agent as two fully isolated stacks, each with its own Traefik and Authentik. This maximizes blast-radius isolation and avoids naming and DNS conflicts across environments. Key implications: + - Separate external networks and DNS namespaces per stack - Duplicate edge (Traefik) and IdP (Authentik), independent upgrades and rollbacks - Slightly higher resource usage in exchange for strong isolation @@ -139,6 +147,7 @@ Key implications: ### Domain Mapping **Company Services** (existing): + - `traefik.harkon.co.uk` - Traefik dashboard - `auth.harkon.co.uk` - Authentik SSO - `gitea.harkon.co.uk` - Git hosting @@ -146,6 +155,7 @@ Key implications: - `portainer.harkon.co.uk` - Docker management **Application Services** (app stack): + - `review.` - Review UI - `api.` - API Gateway (microservices via Traefik) - `vault.` - Vault UI (admin only) @@ -159,12 +169,14 @@ Key implications: ### Authentication Strategy **Authentik Configuration**: + 1. **Company Group** - Access to Gitea, Nextcloud, Portainer 2. **App Admin Group** - Full access to all app services 3. **App User Group** - Access to Review UI and API 4. **App Reviewer Group** - Access to Review UI only **Middleware Configuration**: + - `authentik-forwardauth` - Standard auth for all services - `admin-auth` - Requires admin group (Vault, MinIO, Neo4j, etc.) - `reviewer-auth` - Requires reviewer or higher @@ -182,6 +194,7 @@ Key implications: ### Development Environment **Keep Existing Setup**: + - Use `docker-compose.local.yml` as-is - Domain: `*.local.lan` - Self-signed certificates @@ -189,6 +202,7 @@ Key implications: - Full stack runs locally **Benefits**: + - No dependency on remote server - Fast iteration - Complete isolation @@ -217,19 +231,22 @@ make deploy-production # Deploy to remote server ### Phase 1: Preparation (Week 1) 1. **Backup Current State** + ```bash ssh deploy@141.136.35.199 - cd /opt/compose + cd /opt tar -czf ~/backup-$(date +%Y%m%d).tar.gz . ``` 2. **Create Production Environment File** - - Copy `infra/compose/env.example` to `infra/compose/.env.production` + + - Copy `infra/environments/production/.env.example` to `infra/environments/production/.env` - Update all secrets and passwords - Set `DOMAIN=harkon.co.uk` - Configure GoDaddy API credentials 3. **Update Traefik Configuration** + - Merge local Traefik config with remote - Add application routes - Configure Authentik ForwardAuth @@ -242,13 +259,15 @@ make deploy-production # Deploy to remote server ### Phase 2: Infrastructure Deployment (Week 2) 1. **Deploy Application Infrastructure** + ```bash # On remote server - cd /opt/compose/ai-tax-agent + cd /opt/ai-tax-agent docker compose -f infrastructure.yaml up -d ``` 2. **Initialize Services** + - Vault: Unseal and configure - Postgres: Run migrations - Neo4j: Install plugins @@ -262,11 +281,13 @@ make deploy-production # Deploy to remote server ### Phase 3: Application Deployment (Week 3) 1. **Deploy Microservices** + ```bash docker compose -f services.yaml up -d ``` 2. **Deploy Monitoring** + ```bash docker compose -f monitoring.yaml up -d ``` diff --git a/docs/DEPLOYMENT_PROGRESS.md b/docs/DEPLOYMENT_PROGRESS.md index 3c1f0b9..46bd2ed 100644 --- a/docs/DEPLOYMENT_PROGRESS.md +++ b/docs/DEPLOYMENT_PROGRESS.md @@ -10,7 +10,7 @@ ### 1. Production Compose Files Created -Created three production-ready Docker Compose files in `infra/compose/production/`: +Created three production-ready Docker Compose files in `infra/base/`: #### **infrastructure.yaml** - Vault (secrets management) @@ -104,7 +104,7 @@ chmod +x scripts/deploy-to-production.sh ### 3. Documentation Created -#### **infra/compose/production/README.md** +#### **infra/base manifests** Comprehensive production deployment guide including: - Prerequisites checklist - Three deployment options (automated, step-by-step, manual) @@ -221,7 +221,7 @@ Or step-by-step: 1. **Initialize Vault** ```bash ssh deploy@141.136.35.199 - cd /opt/compose/ai-tax-agent + cd /opt/ai-tax-agent docker exec -it vault vault operator init # Save unseal keys! docker exec -it vault vault operator unseal @@ -382,7 +382,6 @@ Deployment is successful when: If you encounter issues: 1. Check logs: `./scripts/deploy-to-production.sh logs ` 2. Verify status: `./scripts/deploy-to-production.sh verify` -3. Review documentation: `infra/compose/production/README.md` +3. Review manifests: `infra/base/*.yaml` 4. Check deployment plan: `docs/DEPLOYMENT_PLAN.md` 5. Follow checklist: `docs/DEPLOYMENT_CHECKLIST.md` - diff --git a/docs/DEPLOYMENT_STATUS.md b/docs/DEPLOYMENT_STATUS.md index 788b7b7..bd779dc 100644 --- a/docs/DEPLOYMENT_STATUS.md +++ b/docs/DEPLOYMENT_STATUS.md @@ -21,15 +21,14 @@ - ✅ Created quick start guide (`docs/QUICK_START.md`) ### 3. Production Configuration Files -- ✅ Created `infra/compose/production/infrastructure.yaml` (7 infrastructure services) -- ✅ Created `infra/compose/production/services.yaml` (14 application services + UI) -- ✅ Created `infra/compose/production/monitoring.yaml` (Prometheus, Grafana, Loki, Promtail) -- ✅ Created `infra/compose/production/README.md` (deployment guide) +- ✅ Created `infra/base/infrastructure.yaml` (infrastructure, incl. Traefik + Authentik) +- ✅ Created `infra/base/services.yaml` (application services + UI) +- ✅ Created `infra/base/monitoring.yaml` (Prometheus, Grafana, Loki, Promtail) ### 4. Monitoring Configuration -- ✅ Created Prometheus configuration (`infra/compose/prometheus/prometheus.yml`) -- ✅ Created Loki configuration (`infra/compose/loki/loki-config.yml`) -- ✅ Created Promtail configuration (`infra/compose/promtail/promtail-config.yml`) +- ✅ Created Prometheus configuration (`infra/base/prometheus/prometheus.yml`) +- ✅ Created Loki configuration (`infra/base/loki/loki-config.yml`) +- ✅ Created Promtail configuration (`infra/base/promtail/promtail-config.yml`) - ✅ Configured service discovery for all 14 services - ✅ Set up 30-day metrics retention @@ -266,10 +265,9 @@ df -h - `docs/ENVIRONMENT_COMPARISON.md` - Local vs Production comparison 2. **Configuration:** - - `infra/compose/production/README.md` - Production compose guide - - `infra/compose/production/infrastructure.yaml` - Infrastructure services - - `infra/compose/production/services.yaml` - Application services - - `infra/compose/production/monitoring.yaml` - Monitoring stack + - `infra/base/infrastructure.yaml` - Infrastructure services + - `infra/base/services.yaml` - Application services + - `infra/base/monitoring.yaml` - Monitoring stack 3. **Deployment:** - `docs/POST_BUILD_DEPLOYMENT.md` - Post-build deployment steps @@ -319,4 +317,3 @@ For questions or issues: - 🟡 In Progress - ⏳ Pending - ❌ Blocked - diff --git a/docs/ENVIRONMENT_COMPARISON.md b/docs/ENVIRONMENT_COMPARISON.md index 8eb4a05..61e8e06 100644 --- a/docs/ENVIRONMENT_COMPARISON.md +++ b/docs/ENVIRONMENT_COMPARISON.md @@ -12,7 +12,7 @@ This document compares the local development environment with the production env | **SSL** | Self-signed certificates | Let's Encrypt (GoDaddy DNS) | | **Networks** | `ai-tax-agent-frontend`
`ai-tax-agent-backend` | `frontend`
`backend` | | **Compose File** | `docker-compose.local.yml` | `infrastructure.yaml`
`services.yaml`
`monitoring.yaml` | -| **Location** | Local machine | `deploy@141.136.35.199:/opt/compose/ai-tax-agent/` | +| **Location** | Local machine | `deploy@141.136.35.199:/opt/ai-tax-agent/` | | **Traefik** | Isolated instance | Shared with company services | | **Authentik** | Isolated instance | Shared with company services | | **Data Persistence** | Local Docker volumes | Remote Docker volumes + backups | @@ -271,7 +271,7 @@ make clean #### Production ```bash # Deploy infrastructure -cd /opt/compose/ai-tax-agent +cd /opt/ai-tax-agent docker compose -f infrastructure.yaml up -d # Deploy services @@ -370,7 +370,7 @@ docker compose -f services.yaml up -d --no-deps svc-ingestion 4. **Deploy to production**: ```bash ssh deploy@141.136.35.199 - cd /opt/compose/ai-tax-agent + cd /opt/ai-tax-agent docker compose -f services.yaml pull docker compose -f services.yaml up -d ``` @@ -436,4 +436,3 @@ The key differences between local and production environments are: 6. **Backups**: Local has none; production has automated backups Both environments use the same application code and Docker images, ensuring consistency and reducing deployment risks. - diff --git a/docs/GITEA_REGISTRY_DEBUG.md b/docs/GITEA_REGISTRY_DEBUG.md deleted file mode 100644 index a0c5f6b..0000000 --- a/docs/GITEA_REGISTRY_DEBUG.md +++ /dev/null @@ -1,332 +0,0 @@ -# Gitea Container Registry Debugging Guide - -## Common Issues When Pushing Large Docker Images - -### Issue 1: Not Logged In - -**Symptom**: `unauthorized: authentication required` - -**Solution**: -```bash -# On remote server -docker login gitea.harkon.co.uk -# Username: blue (or your Gitea username) -# Password: -``` - ---- - -### Issue 2: Upload Size Limit (413 Request Entity Too Large) - -**Symptom**: Push fails with `413 Request Entity Too Large` or similar error - -**Root Cause**: Traefik or Gitea has a limit on request body size - -**Solution A: Configure Traefik Middleware** - -1. Find your Traefik configuration directory: -```bash -docker inspect traefik | grep -A 10 Mounts -``` - -2. Create middleware configuration: -```bash -# Example: /opt/traefik/config/middlewares.yml -sudo tee /opt/traefik/config/middlewares.yml > /dev/null << 'EOF' -http: - middlewares: - large-upload: - buffering: - maxRequestBodyBytes: 5368709120 # 5GB - memRequestBodyBytes: 104857600 # 100MB - maxResponseBodyBytes: 5368709120 # 5GB - memResponseBodyBytes: 104857600 # 100MB -EOF -``` - -3. Update Gitea container labels: -```yaml -labels: - - "traefik.http.routers.gitea.middlewares=large-upload@file" -``` - -4. Restart Traefik: -```bash -docker restart traefik -``` - -**Solution B: Configure Gitea Directly** - -1. Edit Gitea configuration: -```bash -docker exec -it gitea-server vi /data/gitea/conf/app.ini -``` - -2. Add/modify these settings: -```ini -[server] -LFS_MAX_FILE_SIZE = 5368709120 ; 5GB - -[repository.upload] -FILE_MAX_SIZE = 5368709120 ; 5GB -``` - -3. Restart Gitea: -```bash -docker restart gitea-server -``` - ---- - -### Issue 3: Network Timeout - -**Symptom**: Push hangs or times out after uploading for a while - -**Root Cause**: Network instability or slow connection - -**Solution**: Use chunked uploads or increase timeout - -1. Configure Docker daemon timeout: -```bash -# Edit /etc/docker/daemon.json -sudo tee /etc/docker/daemon.json > /dev/null << 'EOF' -{ - "max-concurrent-uploads": 1, - "max-concurrent-downloads": 3, - "registry-mirrors": [] -} -EOF - -sudo systemctl restart docker -``` - -2. Or use Traefik timeout middleware: -```yaml -http: - middlewares: - long-timeout: - buffering: - retryExpression: "IsNetworkError() && Attempts() < 3" -``` - ---- - -### Issue 4: Disk Space - -**Symptom**: Push fails with "no space left on device" - -**Solution**: -```bash -# Check disk space -df -h - -# Clean up Docker -docker system prune -a --volumes -f - -# Check again -df -h -``` - ---- - -### Issue 5: Gitea Registry Not Enabled - -**Symptom**: `404 Not Found` when accessing `/v2/` - -**Solution**: -```bash -# Check if registry is enabled -docker exec gitea-server cat /data/gitea/conf/app.ini | grep -A 5 "\[packages\]" - -# Should show: -# [packages] -# ENABLED = true -``` - -If not enabled, add to `app.ini`: -```ini -[packages] -ENABLED = true -``` - -Restart Gitea: -```bash -docker restart gitea-server -``` - ---- - -## Debugging Steps - -### Step 1: Verify Gitea Registry is Accessible - -```bash -# Should return 401 Unauthorized (which is good - means registry is working) -curl -I https://gitea.harkon.co.uk/v2/ - -# Should return 200 OK after login -docker login gitea.harkon.co.uk -curl -u "username:token" https://gitea.harkon.co.uk/v2/ -``` - -### Step 2: Test with Small Image - -```bash -# Pull a small image -docker pull alpine:latest - -# Tag it for your registry -docker tag alpine:latest gitea.harkon.co.uk/harkon/test:latest - -# Try to push -docker push gitea.harkon.co.uk/harkon/test:latest -``` - -If this works, the issue is with large images (size limit). - -### Step 3: Check Gitea Logs - -```bash -# Check for errors -docker logs gitea-server --tail 100 | grep -i error - -# Watch logs in real-time while pushing -docker logs -f gitea-server -``` - -### Step 4: Check Traefik Logs - -```bash -# Check for 413 or 502 errors -docker logs traefik --tail 100 | grep -E "413|502|error" - -# Watch logs in real-time -docker logs -f traefik -``` - -### Step 5: Check Docker Daemon Logs - -```bash -# Check Docker daemon logs -sudo journalctl -u docker --since "1 hour ago" | grep -i error -``` - ---- - -## Quick Fix: Bypass Traefik for Registry - -If Traefik is causing issues, you can expose Gitea's registry directly: - -1. Update Gitea docker-compose to expose port 3000: -```yaml -services: - gitea: - ports: - - "3000:3000" # HTTP -``` - -2. Use direct connection: -```bash -docker login gitea.harkon.co.uk:3000 -docker push gitea.harkon.co.uk:3000/harkon/base-ml:v1.0.1 -``` - -**Note**: This bypasses SSL, so only use for debugging! - ---- - -## Recommended Configuration for Large Images - -### Traefik Configuration - -Create `/opt/traefik/config/gitea-registry.yml`: - -```yaml -http: - middlewares: - gitea-registry: - buffering: - maxRequestBodyBytes: 5368709120 # 5GB - memRequestBodyBytes: 104857600 # 100MB in memory - maxResponseBodyBytes: 5368709120 # 5GB - memResponseBodyBytes: 104857600 # 100MB in memory - - routers: - gitea-registry: - rule: "Host(`gitea.harkon.co.uk`) && PathPrefix(`/v2/`)" - entryPoints: - - websecure - middlewares: - - gitea-registry - service: gitea - tls: - certResolver: letsencrypt -``` - -### Gitea Configuration - -In `/data/gitea/conf/app.ini`: - -```ini -[server] -PROTOCOL = http -DOMAIN = gitea.harkon.co.uk -ROOT_URL = https://gitea.harkon.co.uk/ -HTTP_PORT = 3000 -LFS_MAX_FILE_SIZE = 5368709120 - -[repository.upload] -FILE_MAX_SIZE = 5368709120 -ENABLED = true - -[packages] -ENABLED = true -CHUNKED_UPLOAD_PATH = /data/gitea/tmp/package-upload -``` - ---- - -## Testing the Fix - -After applying configuration changes: - -1. Restart services: -```bash -docker restart traefik -docker restart gitea-server -``` - -2. Test with a large layer: -```bash -# Build base-ml (has large layers) -cd /home/deploy/ai-tax-agent -docker build -f infra/docker/base-ml.Dockerfile -t gitea.harkon.co.uk/harkon/base-ml:test . - -# Try to push -docker push gitea.harkon.co.uk/harkon/base-ml:test -``` - -3. Monitor logs: -```bash -# Terminal 1: Watch Traefik -docker logs -f traefik - -# Terminal 2: Watch Gitea -docker logs -f gitea-server - -# Terminal 3: Push image -docker push gitea.harkon.co.uk/harkon/base-ml:test -``` - ---- - -## Alternative: Use Docker Hub or GitHub Container Registry - -If Gitea continues to have issues with large images, consider: - -1. **Docker Hub**: Free for public images -2. **GitHub Container Registry (ghcr.io)**: Free for public/private -3. **GitLab Container Registry**: Free tier available - -These are battle-tested for large ML images and have better defaults for large uploads. - diff --git a/docs/GITEA_REGISTRY_FIX.md b/docs/GITEA_REGISTRY_FIX.md deleted file mode 100644 index fdebddf..0000000 --- a/docs/GITEA_REGISTRY_FIX.md +++ /dev/null @@ -1,194 +0,0 @@ -# Gitea Container Registry - Image Naming Fix - -## Issue - -The initial build script was using incorrect image naming convention for Gitea's container registry. - -### Incorrect Format - -``` -gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:v1.0.0 -``` - -### Correct Format (Per Gitea Documentation) - -``` -gitea.harkon.co.uk/{owner}/{image}:{tag} -``` - -Where `{owner}` must be your **Gitea username** or **organization name**. - -**Using organization:** `harkon` (Gitea team/organization) - -## Solution - -Updated the build script and production compose files to use the correct naming convention. - -### Changes Made - -#### 1. Build Script (`scripts/build-and-push-images.sh`) - -**Before:** - -```bash -REGISTRY="${1:-gitea.harkon.co.uk}" -VERSION="${2:-latest}" -PROJECT="ai-tax-agent" - -IMAGE_NAME="$REGISTRY/$PROJECT/$service:$VERSION" -``` - -**After:** - -```bash -REGISTRY="${1:-gitea.harkon.co.uk}" -VERSION="${2:-latest}" -OWNER="${3:-harkon}" # Gitea organization/team name - -IMAGE_NAME="$REGISTRY/$OWNER/$service:$VERSION" -``` - -#### 2. Production Services (`infra/compose/production/services.yaml`) - -**Before:** - -```yaml -svc-ingestion: - image: gitea.harkon.co.uk/ai-tax-agent/svc-ingestion:latest -``` - -**After:** - -```yaml -svc-ingestion: - image: gitea.harkon.co.uk/harkon/svc-ingestion:latest -``` - -All 14 services updated: - -- svc-ingestion -- svc-extract -- svc-kg -- svc-rag-retriever -- svc-rag-indexer -- svc-forms -- svc-hmrc -- svc-ocr -- svc-rpa -- svc-normalize-map -- svc-reason -- svc-firm-connectors -- svc-coverage -- ui-review - -## Usage - -### Build and Push Images - -```bash -# With default owner (harkon organization) -./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1 - -# With custom owner -./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1 -``` - -### Pull Images - -```bash -docker pull gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1 -``` - -### Push Images Manually - -```bash -# Tag image -docker tag my-image:latest gitea.harkon.co.uk/harkon/my-image:v1.0.1 - -# Push image -docker push gitea.harkon.co.uk/harkon/my-image:v1.0.1 -``` - -## Gitea Registry Documentation Reference - -From Gitea's official documentation: - -### Image Naming Convention - -Images must follow this naming convention: - -``` -{registry}/{owner}/{image} -``` - -When building your docker image, using the naming convention above, this looks like: - -```bash -# build an image with tag -docker build -t {registry}/{owner}/{image}:{tag} . - -# name an existing image with tag -docker tag {some-existing-image}:{tag} {registry}/{owner}/{image}:{tag} -``` - -### Valid Examples - -For owner `testuser` on `gitea.example.com`: - -- ✅ `gitea.example.com/testuser/myimage` -- ✅ `gitea.example.com/testuser/my-image` -- ✅ `gitea.example.com/testuser/my/image` - -### Important Notes - -1. **Owner must exist**: The owner (username or organization) must exist in Gitea -2. **Case-insensitive tags**: `image:tag` and `image:Tag` are treated as the same -3. **Authentication required**: Use personal access token with `write:package` scope -4. **Registry URL**: Use the main Gitea domain, not a separate registry subdomain - -## Verification - -After the fix, verify images are pushed correctly: - -```bash -# Login to Gitea -docker login gitea.harkon.co.uk - -# Check pushed images in Gitea UI -# Navigate to: https://gitea.harkon.co.uk/blue/-/packages -``` - -## Current Build Status - -✅ **Fixed and working!** - -Build command: - -```bash -./scripts/build-and-push-images.sh gitea.harkon.co.uk v1.0.1 harkon -``` - -Expected output: - -``` -ℹ️ Logging in to registry: gitea.harkon.co.uk -Login Succeeded -ℹ️ Building svc-ingestion... -ℹ️ Building: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1 -✅ Built: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1 -ℹ️ Pushing: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1 -✅ Pushed: gitea.harkon.co.uk/harkon/svc-ingestion:v1.0.1 -``` - -## Next Steps - -1. ✅ Build script fixed -2. ✅ Production compose files updated -3. 🟡 Build in progress (14 services) -4. ⏳ Deploy to production (after build completes) - -## References - -- [Gitea Container Registry Documentation](https://docs.gitea.com/usage/packages/container) -- Build script: `scripts/build-and-push-images.sh` -- Production services: `infra/compose/production/services.yaml` diff --git a/docs/OPTIMIZATION_SUMMARY.md b/docs/OPTIMIZATION_SUMMARY.md index 72c01f7..fcf98ba 100644 --- a/docs/OPTIMIZATION_SUMMARY.md +++ b/docs/OPTIMIZATION_SUMMARY.md @@ -148,11 +148,11 @@ docker run --rm gitea.harkon.co.uk/harkon/svc-ocr:v1.0.1 pip list | grep torch ### 5. Update Production Deployment -Update `infra/compose/production/services.yaml` to use `v1.0.1`: +Update `infra/base/services.yaml` to use `v1.0.1`: ```bash # Find and replace v1.0.0 with v1.0.1 -sed -i '' 's/:v1.0.0/:v1.0.1/g' infra/compose/production/services.yaml +sed -i '' 's/:v1.0.0/:v1.0.1/g' infra/base/services.yaml # Or use latest tag (already configured) # No changes needed if using :latest diff --git a/docs/QUICK_START.md b/docs/QUICK_START.md index 24e10c4..c28f99d 100644 --- a/docs/QUICK_START.md +++ b/docs/QUICK_START.md @@ -50,7 +50,7 @@ docker login gitea.harkon.co.uk **SSH to server:** ```bash ssh deploy@141.136.35.199 -cd /opt/compose/ai-tax-agent +cd /opt/ai-tax-agent ``` **Initialize Vault:** @@ -62,19 +62,19 @@ docker exec -it vault vault operator unseal **Create MinIO Buckets:** ```bash -docker exec -it minio mc alias set local http://localhost:9092 admin -docker exec -it minio mc mb local/documents -docker exec -it minio mc mb local/models +docker exec -it apa-minio mc alias set local http://localhost:9000 admin +docker exec -it apa-minio mc mb local/documents +docker exec -it apa-minio mc mb local/models ``` **Create NATS Streams:** ```bash -docker exec -it nats nats stream add TAX_AGENT_EVENTS \ +docker exec -it apa-nats nats stream add TAX_AGENT_EVENTS \\ --subjects="tax.>" --storage=file --retention=limits --max-age=7d ``` **Configure Authentik:** -1. Go to https://authentik.harkon.co.uk +1. Go to https://auth.harkon.co.uk 2. Create groups: `app-admin`, `app-user`, `app-reviewer` 3. Create OAuth providers for: - Review UI: `app.harkon.co.uk` @@ -94,7 +94,7 @@ curl -I https://api.harkon.co.uk/healthz curl -I https://grafana.harkon.co.uk # View logs -./scripts/deploy-to-production.sh logs svc-ingestion +./scripts/deploy-to-production.sh logs apa-svc-ingestion ``` --- @@ -127,8 +127,8 @@ curl -I https://grafana.harkon.co.uk ### Restart Service ```bash ssh deploy@141.136.35.199 -cd /opt/compose/ai-tax-agent -docker compose -f services.yaml restart svc-ingestion +cd /opt/ai-tax-agent +docker compose -f services.yaml restart apa-svc-ingestion ``` ### Check Status @@ -163,25 +163,25 @@ docker compose -f services.yaml logs svc-ingestion docker compose -f infrastructure.yaml ps # Restart -docker compose -f services.yaml restart svc-ingestion +docker compose -f services.yaml restart apa-svc-ingestion ``` ### SSL Issues ```bash # Check Traefik logs -docker logs traefik +docker logs apa-traefik # Check certificates -sudo cat /opt/compose/traefik/certs/godaddy-acme.json | jq +sudo cat /opt/ai-tax-agent/traefik/certs/godaddy-acme.json | jq ``` ### Database Connection ```bash # Test Postgres -docker exec -it postgres pg_isready -U postgres +docker exec -it apa-postgres pg_isready -U postgres # Check env vars -docker exec -it svc-ingestion env | grep POSTGRES +docker exec -it apa-svc-ingestion env | grep POSTGRES ``` --- @@ -190,7 +190,7 @@ docker exec -it svc-ingestion env | grep POSTGRES ```bash ssh deploy@141.136.35.199 -cd /opt/compose/ai-tax-agent +cd /opt/ai-tax-agent # Stop services docker compose -f services.yaml down @@ -198,12 +198,11 @@ docker compose -f infrastructure.yaml down docker compose -f monitoring.yaml down # Restore backup -cd /opt/compose +cd /opt tar -xzf ~/backups/backup-YYYYMMDD-HHMMSS.tar.gz -# Restart company services -cd /opt/compose/traefik && docker compose up -d -cd /opt/compose/authentik && docker compose up -d +# Restart application infra +cd /opt/ai-tax-agent && docker compose -f infrastructure.yaml up -d ``` --- @@ -242,4 +241,3 @@ cd /opt/compose/authentik && docker compose up -d ```bash ./scripts/deploy-to-production.sh logs ``` - diff --git a/libs/config.py b/docs/SRE.md similarity index 100% rename from libs/config.py rename to docs/SRE.md diff --git a/infra/configs/authentik/bootstrap.yaml b/infra/base/authentik/bootstrap.yaml similarity index 100% rename from infra/configs/authentik/bootstrap.yaml rename to infra/base/authentik/bootstrap.yaml diff --git a/infra/configs/loki/loki-config.yml b/infra/base/loki/loki-config.yml similarity index 100% rename from infra/configs/loki/loki-config.yml rename to infra/base/loki/loki-config.yml diff --git a/infra/configs/promtail/promtail-config.yml b/infra/base/promtail/promtail-config.yml similarity index 100% rename from infra/configs/promtail/promtail-config.yml rename to infra/base/promtail/promtail-config.yml diff --git a/infra/base/traefik/config/traefik-dynamic.yml b/infra/base/traefik/config/traefik-dynamic.yml index 12d1a9a..fccc8d6 100644 --- a/infra/base/traefik/config/traefik-dynamic.yml +++ b/infra/base/traefik/config/traefik-dynamic.yml @@ -16,3 +16,49 @@ http: - X-authentik-meta-provider - X-authentik-meta-app - X-authentik-meta-version + + # Large upload middleware for Gitea registry + gitea-large-upload: + buffering: + maxRequestBodyBytes: 5368709120 # 5GB + memRequestBodyBytes: 104857600 # 100MB + maxResponseBodyBytes: 5368709120 # 5GB + memResponseBodyBytes: 104857600 # 100MB + retryExpression: "IsNetworkError() && Attempts() < 3" + + # Rate limiting for public APIs + api-ratelimit: + rateLimit: + average: 100 + burst: 50 + period: 1s + + # Security headers + security-headers: + headers: + frameDeny: true + sslRedirect: true + browserXssFilter: true + contentTypeNosniff: true + stsIncludeSubdomains: true + stsPreload: true + stsSeconds: 31536000 + + # CORS headers + api-cors: + headers: + accessControlAllowMethods: + - GET + - POST + - PUT + - DELETE + - OPTIONS + accessControlAllowOriginList: + - "https://app.harkon.co.uk" + accessControlAllowHeaders: + - "Content-Type" + - "Authorization" + accessControlMaxAge: 100 + addVaryHeader: true + + # Security headers diff --git a/infra/base/traefik/config/traefik.yml b/infra/base/traefik/config/traefik.yml index 03e2050..ac85764 100644 --- a/infra/base/traefik/config/traefik.yml +++ b/infra/base/traefik/config/traefik.yml @@ -4,7 +4,9 @@ entryPoints: address: ":80" websecure: address: ":443" - + transport: + respondingTimeouts: + readTimeout: 30m api: dashboard: true diff --git a/infra/configs/traefik/app-middlewares.yml b/infra/configs/traefik/app-middlewares.yml deleted file mode 100644 index fd19748..0000000 --- a/infra/configs/traefik/app-middlewares.yml +++ /dev/null @@ -1,31 +0,0 @@ -# Application-specific Traefik middlewares -# These are loaded by the application infrastructure, not the external Traefik - -http: - middlewares: - # Large upload middleware for Gitea registry - gitea-large-upload: - buffering: - maxRequestBodyBytes: 5368709120 # 5GB - memRequestBodyBytes: 104857600 # 100MB - maxResponseBodyBytes: 5368709120 # 5GB - memResponseBodyBytes: 104857600 # 100MB - retryExpression: "IsNetworkError() && Attempts() < 3" - - # Rate limiting for public APIs - api-ratelimit: - rateLimit: - average: 100 - burst: 50 - period: 1s - - # Security headers - security-headers: - headers: - frameDeny: true - sslRedirect: true - browserXssFilter: true - contentTypeNosniff: true - stsIncludeSubdomains: true - stsPreload: true - stsSeconds: 31536000 diff --git a/infra/configs/traefik/certs/local.crt b/infra/configs/traefik/certs/local.crt deleted file mode 100644 index e0df05d..0000000 --- a/infra/configs/traefik/certs/local.crt +++ /dev/null @@ -1,25 +0,0 @@ ------BEGIN CERTIFICATE----- -MIIEHjCCAwagAwIBAgIUbOm5g4Xhb08Lk6DIpVst7+xZHOswDQYJKoZIhvcNAQEL -BQAwEDEOMAwGA1UEAwwFbG9jYWwwHhcNMjUwOTI4MTExNTM1WhcNMzUwOTI2MTEx -NTM1WjAQMQ4wDAYDVQQDDAVsb2NhbDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCC -AQoCggEBAK0370DEo3dScS8uLwBsXkuaAHn9wO2fjxEHLZwHWfFo/16t+EEAi5c3 -zDs7nYQ7LPLndxBfO6xZ5uWKNIVtp6ARzAeRbGgbjXDdK3fOyRdhhKR3aZVOH1D0 -xUjEm/X5jEDv81sufSjk+DIQmh8hQnp3RwdHyhkIZUCTsBXMfnj+zs1UKTdRQBF5 -SUplGsbh6z3xCSI4jiNRb7mNHXqV3Fv6ycwF8YdthSDfueltBP4vT/CDtebkkKPF -dx7YWEIPPUNqEoHqeI5iYP6gnWJYcr3vU+p2BuTwUICo+njzAf+P/SsjPHbujJob -dbHUclBHIrIO4BpYZtY1a7E219MbqcECAwEAAaOCAW4wggFqMB0GA1UdDgQWBBQ7 -qHpza0Bb1xI1g7cMBx33JnFQljAfBgNVHSMEGDAWgBQ7qHpza0Bb1xI1g7cMBx33 -JnFQljAPBgNVHRMBAf8EBTADAQH/MIIBFQYDVR0RBIIBDDCCAQiCCWxvY2FsaG9z -dIcEfwAAAYILKi5sb2NhbC5sYW6CDmF1dGgubG9jYWwubGFughFncmFmYW5hLmxv -Y2FsLmxhboIQcmV2aWV3LmxvY2FsLmxhboINYXBpLmxvY2FsLmxhboIPdmF1bHQu -bG9jYWwubGFugg9taW5pby5sb2NhbC5sYW6CE21pbmlvLWFwaS5sb2NhbC5sYW6C -EHFkcmFudC5sb2NhbC5sYW6CD25lbzRqLmxvY2FsLmxhboIUcHJvbWV0aGV1cy5s -b2NhbC5sYW6CDmxva2kubG9jYWwubGFughF1bmxlYXNoLmxvY2FsLmxhboIRdHJh -ZWZpay5sb2NhbC5sYW4wDQYJKoZIhvcNAQELBQADggEBAICf+2MZ7BHbSD/pnvll -G7Zmk+Bntj2F6RBQVZ2ZsKPWkHeZEYJDRvU0I2uL5tvvDJp4q0hjdluJllchhGgr -qfu7i+kRnhzme7oyRTFGYp8b3zHBvLyJLmdIALxuNSjIEeh1Fx0lEhKwqOlA4y6T -jziPmsGv3IonGJM2dURGNcR7DfG6H/Yl12qV8u/tVFTxqWL+hyCE7u8v+ZIcZ+fj -82X7hXt1HvfP84EhVtfqQMb5xykLtXvPqggSCFXYIj2PanWdwEdE6P5Yr2D1Yz7r -tzpmpoetrGoMWIeB0yiWgt0qJ/KK7meoCp64mqfBc48p1p/7kj2R/FRH1Jx3gFWy -dT4= ------END CERTIFICATE----- diff --git a/infra/configs/traefik/certs/local.key b/infra/configs/traefik/certs/local.key deleted file mode 100644 index 16ae358..0000000 --- a/infra/configs/traefik/certs/local.key +++ /dev/null @@ -1,28 +0,0 @@ ------BEGIN PRIVATE KEY----- -MIIEvAIBADANBgkqhkiG9w0BAQEFAASCBKYwggSiAgEAAoIBAQCtN+9AxKN3UnEv -Li8AbF5LmgB5/cDtn48RBy2cB1nxaP9erfhBAIuXN8w7O52EOyzy53cQXzusWebl -ijSFbaegEcwHkWxoG41w3St3zskXYYSkd2mVTh9Q9MVIxJv1+YxA7/NbLn0o5Pgy -EJofIUJ6d0cHR8oZCGVAk7AVzH54/s7NVCk3UUAReUlKZRrG4es98QkiOI4jUW+5 -jR16ldxb+snMBfGHbYUg37npbQT+L0/wg7Xm5JCjxXce2FhCDz1DahKB6niOYmD+ -oJ1iWHK971Pqdgbk8FCAqPp48wH/j/0rIzx27oyaG3Wx1HJQRyKyDuAaWGbWNWux -NtfTG6nBAgMBAAECggEAHvtkNcd2HX+HcxLloUPA0fDnqOo0OcxSQI9yHvhJpB5N -nterEaVRUmjOhMGy+NXEwmWYLDt8ZuVloSTJJBxq4PyN68SdCTn0YH2Oqs03tpDg -srIRFn10qHw/VTalVqed6HeCpYp5JHlf00SY7Hx8cX8oGytCAJw50AUad6ut62IM -sp/QFdtkLhtq9vGzQUqyIP92Y/+GbxhB+eHkuvvFau1KJq7K8qhroFTwQFts9er2 -890Ujmz3bF2RhHixQcpXpsf/DMyylGJTbZDmSFkTDa/c1PzqvKrmL3wP7A3bk1E5 -CP8/a65ykotJEX8RkWqH2XxvRKpdWtCaeuCsmWUQ4QKBgQDTLbC9DWHCUYMWJhyW -TKAeXx5xFGHIqggN28lIkXFiCVsTZyOuRDN7Q/CbOat/0JthrzyP18L+6ewZt2ZN -RjdfGdnpUCJx6LR4dtBH8Rc+CjlSnqEgJIkgfIs8b9uEhMI1eQV+BAFQON3BzdpT -wQ86aGsrdqtpfav7cImVfGcY/QKBgQDR+7OcnEwh8s/1J2niMKjk8agyCGGHWW4M -g+vIv7lptavgEGOPMBv7QgmeuUjwSszphQXL36m39ZRmI5B+J0/onuQzv04tJeZY -WZhA+T12a+1VnvUZNZm/qp0I2rW+4m+DmJoLQlvpaaFit/1fPJ6+IzI2VzPeWhw2 -vUQ5QIYhFQKBgFUWZc3mpGsNOMol1QLiIOnb3YImejfF+rTKx9FLeOnNZzrsJb5D -kJKsDzgcBnPbc5/qYXZ7sv/O9OhvsvKTxh+1ZM3TEe3fm0emZ8l05K6EpBAcBkPT -NMU4KUnSsBo2+6Fb/9CEgJr4LrG15bA1a5NXG0dJ60r37eHDuEvY8hlpAoGADWv2 -PhNrdlwL2NKtHO0ZTpD3vEL24OzhcOFZx9ohYtVe6BKEGpnrn/LHpKKZO+q8EE0V -YsOoGH8U/jZVvQqMPAUz9u7Kc25Ru+H2Lmj/+brKT8e6SOM5MZwZL4CzT0Ev+Yxe -hEu4jkHXM/Uot9arGuIrCngmc5b06LbOTo6GREUCgYArWyPYeETah/GVwU7/TNY5 -5f8lNbWBoXZfpVbWdoUZT6tGWciZsiXSR4x9f+1/LMIuChegSEazrJUDt7TbCkZs -s4A66pnME37aYP2sMvJF3zSnQWVIyBgGI5xX0XW/WdozKl1mdFfigyWp58uo2dS2 -TxE3dy8rxpUdDCUmvJT/Fw== ------END PRIVATE KEY----- diff --git a/libs/neo/client.py b/libs/neo/client.py index 315c44a..9b238d0 100644 --- a/libs/neo/client.py +++ b/libs/neo/client.py @@ -134,7 +134,7 @@ class Neo4jClient: result = await self.run_query(query, {"properties": properties}, database) node = result[0]["n"] if result else {} # Return node ID if available, otherwise return the full node - return node.get("id", node) + return node.get("id", node) # type: ignore async def update_node( self, @@ -209,7 +209,7 @@ class Neo4jClient: database, ) rel = result[0]["r"] if result else {} - return rel.get("id", rel) + return rel.get("id", rel) # type: ignore # Original signature (using labels and IDs) rel_properties = properties or {} @@ -231,7 +231,7 @@ class Neo4jClient: ) rel = result[0]["r"] if result else {} # Return relationship ID if available, otherwise return the full relationship - return rel.get("id", rel) + return rel.get("id", rel) # type: ignore async def get_node_lineage( self, node_id: str, max_depth: int = 10, database: str = "neo4j" diff --git a/infra/configs/traefik/certs/acme.json b/libs/ocr/__init__.py similarity index 100% rename from infra/configs/traefik/certs/acme.json rename to libs/ocr/__init__.py diff --git a/libs/ocr/processor.py b/libs/ocr/processor.py new file mode 100644 index 0000000..05e40b4 --- /dev/null +++ b/libs/ocr/processor.py @@ -0,0 +1,507 @@ +import base64 +import concurrent.futures +import io +import json +import os +from pathlib import Path +from typing import Any + +import numpy as np +import requests +from PIL import Image, ImageFilter +from PyPDF2 import PdfReader + + +class OCRProcessor: + def __init__( + self, + model_name: str = "llama3.2-vision:11b", + base_url: str = "http://localhost:11434/api/generate", + max_workers: int = 1, + provider: str = "ollama", + openai_api_key: str | None = None, + openai_base_url: str = "https://api.openai.com/v1/chat/completions", + ): + self.model_name = model_name + self.base_url = base_url + self.max_workers = max_workers + self.provider = provider.lower() + self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY") + self.openai_base_url = openai_base_url + + def _encode_image(self, image_path: str) -> str: + """Convert image to base64 string""" + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode("utf-8") + + def _pdf_to_images(self, pdf_path: str) -> list[str]: + """ + Convert each page of a PDF to an image without PyMuPDF. + Strategy: extract largest embedded image per page via PyPDF2. + Saves each selected image as a temporary PNG and returns paths. + + Note: Text-only pages with no embedded images will be skipped here. + Use _pdf_extract_text as a fallback for such pages. + """ + image_paths: list[str] = [] + try: + reader = PdfReader(pdf_path) + for page_index, page in enumerate(reader.pages): + try: + resources = page.get("/Resources") + if resources is None: + continue + xobject = resources.get("/XObject") + if xobject is None: + continue + xobject = xobject.get_object() + largest = None + largest_area = -1 + for _, obj_ref in xobject.items(): + try: + obj = obj_ref.get_object() + if obj.get("/Subtype") != "/Image": + continue + width = int(obj.get("/Width", 0)) + height = int(obj.get("/Height", 0)) + area = width * height + if area > largest_area: + largest = obj + largest_area = area + except Exception: + continue + + if largest is None: + continue + + data = largest.get_data() + filt = largest.get("/Filter") + out_path = f"{pdf_path}_page{page_index}.png" + # If JPEG/JPX, write bytes directly; else convert via PIL + if filt in ("/DCTDecode",): + # JPEG + out_path = f"{pdf_path}_page{page_index}.jpg" + with open(out_path, "wb") as f: + f.write(data) + elif filt in ("/JPXDecode",): + out_path = f"{pdf_path}_page{page_index}.jp2" + with open(out_path, "wb") as f: + f.write(data) + else: + mode = "RGB" + colorspace = largest.get("/ColorSpace") + if colorspace in ("/DeviceGray",): + mode = "L" + width = int(largest.get("/Width", 0)) + height = int(largest.get("/Height", 0)) + try: + img = Image.frombytes(mode, (width, height), data) + except Exception: + # Best-effort decode via Pillow + img = Image.open(io.BytesIO(data)) + img.save(out_path, format="PNG") + + image_paths.append(out_path) + except Exception: + # Continue gracefully for problematic pages/objects + continue + return image_paths + except Exception as e: + raise ValueError(f"Could not extract images from PDF: {e}") + + def _pdf_extract_text(self, pdf_path: str) -> list[str]: + """Extract text per page using pdfplumber if available, else PyPDF2.""" + texts: list[str] = [] + try: + try: + import pdfplumber + + with pdfplumber.open(pdf_path) as pdf: + for page in pdf.pages: + texts.append(page.extract_text() or "") + return texts + except Exception: + # Fallback to PyPDF2 + reader = PdfReader(pdf_path) + for page in reader.pages: # type: ignore + texts.append(page.extract_text() or "") + return texts + except Exception as e: + raise ValueError(f"Could not extract text from PDF: {e}") + + def _call_ollama_vision(self, prompt: str, image_base64: str) -> str: + payload = { + "model": self.model_name, + "prompt": prompt, + "stream": False, + "images": [image_base64], + } + response = requests.post(self.base_url, json=payload) + response.raise_for_status() + return response.json().get("response", "") # type: ignore + + def _call_openai_vision(self, prompt: str, image_base64: str) -> str: + if not self.openai_api_key: + raise ValueError("OPENAI_API_KEY not set") + # Compose chat.completions payload for GPT-4o/mini vision + payload = { + "model": self.model_name or "gpt-4o-mini", + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": prompt}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}", + }, + }, + ], + } + ], + "temperature": 0, + } + headers = { + "Authorization": f"Bearer {self.openai_api_key}", + "Content-Type": "application/json", + } + response = requests.post(self.openai_base_url, headers=headers, json=payload) + response.raise_for_status() + data = response.json() + try: + return data["choices"][0]["message"]["content"] # type: ignore + except Exception: + return json.dumps(data) + + def _preprocess_image(self, image_path: str, language: str = "en") -> str: + """ + Preprocess image before OCR using Pillow + NumPy: + - Convert to grayscale + - Histogram equalization (contrast) + - Median denoise + - Otsu threshold and invert + """ + try: + with Image.open(image_path) as img: + if img.mode in ("RGBA", "LA"): + img = img.convert("RGB") + gray = img.convert("L") + + # Histogram equalization via cumulative distribution + arr = np.asarray(gray) + hist, _ = np.histogram(arr.flatten(), 256, [0, 256]) # type: ignore + cdf = hist.cumsum() + cdf_masked = np.ma.masked_equal(cdf, 0) # type: ignore + cdf_min = cdf_masked.min() if cdf_masked.size else 0 + cdf_max = cdf_masked.max() if cdf_masked.size else 0 + if cdf_max == cdf_min: + eq = arr + else: + cdf_scaled = (cdf_masked - cdf_min) * 255 / (cdf_max - cdf_min) + lut = np.ma.filled(cdf_scaled, 0).astype("uint8") + eq = lut[arr] + + eq_img = Image.fromarray(eq, mode="L") + # Median filter (3x3) to reduce noise + eq_img = eq_img.filter(ImageFilter.MedianFilter(size=3)) + arr_eq = np.asarray(eq_img) + + # Otsu threshold + hist2, _ = np.histogram(arr_eq, 256, [0, 256]) # type: ignore + total = arr_eq.size + sum_total = (np.arange(256) * hist2).sum() + sum_b = 0.0 + w_b = 0.0 + max_var = 0.0 + thr = 0 + for t in range(256): + w_b += hist2[t] + if w_b == 0: + continue + w_f = total - w_b + if w_f == 0: + break + sum_b += t * hist2[t] + m_b = sum_b / w_b + m_f = (sum_total - sum_b) / w_f + var_between = w_b * w_f * (m_b - m_f) ** 2 + if var_between > max_var: + max_var = var_between + thr = t + + binary = (arr_eq > thr).astype(np.uint8) * 255 + # Invert: black text on white background + binary = 255 - binary + + out_img = Image.fromarray(binary, mode="L") + preprocessed_path = f"{image_path}_preprocessed.jpg" + out_img.save(preprocessed_path, format="JPEG", quality=95) + return preprocessed_path + except Exception as e: + raise ValueError(f"Failed to preprocess image {image_path}: {e}") + + def process_image( + self, + image_path: str, + format_type: str = "markdown", + preprocess: bool = True, + custom_prompt: str | None = None, + language: str = "en", + ) -> str: + """ + Process an image (or PDF) and extract text in the specified format + + Args: + image_path: Path to the image file or PDF file + format_type: One of ["markdown", "text", "json", "structured", "key_value","custom"] + preprocess: Whether to apply image preprocessing + custom_prompt: If provided, this prompt overrides the default based on format_type + language: Language code to apply language specific OCR preprocessing + """ + try: + # If the input is a PDF, process all pages + if image_path.lower().endswith(".pdf"): + image_pages = self._pdf_to_images(image_path) + responses: list[str] = [] + if image_pages: + for idx, page_file in enumerate(image_pages): + # Process each page with preprocessing if enabled + if preprocess: + preprocessed_path = self._preprocess_image( + page_file, language + ) + else: + preprocessed_path = page_file + + image_base64 = self._encode_image(preprocessed_path) + + if custom_prompt and custom_prompt.strip(): + prompt = custom_prompt + else: + prompts = { + "markdown": f"""Extract all text content from this image in {language} **exactly as it appears**, without modification, summarization, or omission. + Format the output in markdown: + - Use headers (#, ##, ###) **only if they appear in the image** + - Preserve original lists (-, *, numbered lists) as they are + - Maintain all text formatting (bold, italics, underlines) exactly as seen + - **Do not add, interpret, or restructure any content** + """, + "text": f"""Extract all visible text from this image in {language} **without any changes**. + - **Do not summarize, paraphrase, or infer missing text.** + - Retain all spacing, punctuation, and formatting exactly as in the image. + - If text is unclear or partially visible, extract as much as possible without guessing. + - **Include all text, even if it seems irrelevant or repeated.** + """, + "json": f"""Extract all text from this image in {language} and format it as JSON, **strictly preserving** the structure. + - **Do not summarize, add, or modify any text.** + - Maintain hierarchical sections and subsections as they appear. + - Use keys that reflect the document's actual structure (e.g., "title", "body", "footer"). + - Include all text, even if fragmented, blurry, or unclear. + """, + "structured": f"""Extract all text from this image in {language}, **ensuring complete structural accuracy**: + - Identify and format tables **without altering content**. + - Preserve list structures (bulleted, numbered) **exactly as shown**. + - Maintain all section headings, indents, and alignments. + - **Do not add, infer, or restructure the content in any way.** + """, + "key_value": f"""Extract all key-value pairs from this image in {language} **exactly as they appear**: + - Identify and extract labels and their corresponding values without modification. + - Maintain the exact wording, punctuation, and order. + - Format each pair as 'key: value' **only if clearly structured that way in the image**. + - **Do not infer missing values or add any extra text.** + """, + "table": f"""Extract all tabular data from this image in {language} **exactly as it appears**, without modification, summarization, or omission. + - **Preserve the table structure** (rows, columns, headers) as closely as possible. + - **Do not add missing values or infer content**—if a cell is empty, leave it empty. + - Maintain all numerical, textual, and special character formatting. + - If the table contains merged cells, indicate them clearly without altering their meaning. + - Output the table in a structured format such as Markdown, CSV, or JSON, based on the intended use. + """, + } + prompt = prompts.get(format_type, prompts["text"]) + + # Route to chosen provider + if self.provider == "openai": + res = self._call_openai_vision(prompt, image_base64) + else: + res = self._call_ollama_vision(prompt, image_base64) + + responses.append(f"Page {idx + 1}:\n{res}") + + # Clean up temporary files + if preprocess and preprocessed_path.endswith( + "_preprocessed.jpg" + ): + try: + os.remove(preprocessed_path) + except OSError: + pass + if page_file.endswith((".png", ".jpg", ".jp2")): + try: + os.remove(page_file) + except OSError: + pass + + final_result = "\n".join(responses) + if format_type == "json": + try: + json_data = json.loads(final_result) + return json.dumps(json_data, indent=2) + except json.JSONDecodeError: + return final_result + return final_result + else: + # Fallback: no images found; extract raw text per page + text_pages = self._pdf_extract_text(image_path) + combined = [] + for i, t in enumerate(text_pages): + combined.append(f"Page {i + 1}:\n{t}") + return "\n".join(combined) + + # Process non-PDF images as before. + if preprocess: + image_path = self._preprocess_image(image_path, language) + + image_base64 = self._encode_image(image_path) + + # Clean up temporary files + if image_path.endswith(("_preprocessed.jpg", "_temp.jpg")): + os.remove(image_path) + + if custom_prompt and custom_prompt.strip(): + prompt = custom_prompt + print("Using custom prompt:", prompt) + else: + prompts = { + "markdown": f"""Extract all text content from this image in {language} **exactly as it appears**, without modification, summarization, or omission. + Format the output in markdown: + - Use headers (#, ##, ###) **only if they appear in the image** + - Preserve original lists (-, *, numbered lists) as they are + - Maintain all text formatting (bold, italics, underlines) exactly as seen + - **Do not add, interpret, or restructure any content** + """, + "text": f"""Extract all visible text from this image in {language} **without any changes**. + - **Do not summarize, paraphrase, or infer missing text.** + - Retain all spacing, punctuation, and formatting exactly as in the image. + - If text is unclear or partially visible, extract as much as possible without guessing. + - **Include all text, even if it seems irrelevant or repeated.** + """, + "json": f"""Extract all text from this image in {language} and format it as JSON, **strictly preserving** the structure. + - **Do not summarize, add, or modify any text.** + - Maintain hierarchical sections and subsections as they appear. + - Use keys that reflect the document's actual structure (e.g., "title", "body", "footer"). + - Include all text, even if fragmented, blurry, or unclear. + """, + "structured": f"""Extract all text from this image in {language}, **ensuring complete structural accuracy**: + - Identify and format tables **without altering content**. + - Preserve list structures (bulleted, numbered) **exactly as shown**. + - Maintain all section headings, indents, and alignments. + - **Do not add, infer, or restructure the content in any way.** + """, + "key_value": f"""Extract all key-value pairs from this image in {language} **exactly as they appear**: + - Identify and extract labels and their corresponding values without modification. + - Maintain the exact wording, punctuation, and order. + - Format each pair as 'key: value' **only if clearly structured that way in the image**. + - **Do not infer missing values or add any extra text.** + """, + "table": f"""Extract all tabular data from this image in {language} **exactly as it appears**, without modification, summarization, or omission. + - **Preserve the table structure** (rows, columns, headers) as closely as possible. + - **Do not add missing values or infer content**—if a cell is empty, leave it empty. + - Maintain all numerical, textual, and special character formatting. + - If the table contains merged cells, indicate them clearly without altering their meaning. + - Output the table in a structured format such as Markdown, CSV, or JSON, based on the intended use. + """, + } + prompt = prompts.get(format_type, prompts["text"]) + print("Using default prompt:", prompt) # Debug print + + # Call chosen provider with single image + if self.provider == "openai": + result = self._call_openai_vision(prompt, image_base64) + else: + result = self._call_ollama_vision(prompt, image_base64) + + if format_type == "json": + try: + json_data = json.loads(result) + return json.dumps(json_data, indent=2) + except json.JSONDecodeError: + return str(result) + + return str(result) + except Exception as e: + return f"Error processing image: {str(e)}" + + def process_batch( + self, + input_path: str | list[str], + format_type: str = "markdown", + recursive: bool = False, + preprocess: bool = True, + custom_prompt: str | None = None, + language: str = "en", + ) -> dict[str, Any]: + """ + Process multiple images in batch + + Args: + input_path: Path to directory or list of image paths + format_type: Output format type + recursive: Whether to search directories recursively + preprocess: Whether to apply image preprocessing + custom_prompt: If provided, this prompt overrides the default for each image + language: Language code to apply language specific OCR preprocessing + + Returns: + Dictionary with results and statistics + """ + # Collect all image paths + image_paths: list[str | Path] = [] + if isinstance(input_path, str): + base_path = Path(input_path) + if base_path.is_dir(): + pattern = "**/*" if recursive else "*" + for ext in [".png", ".jpg", ".jpeg", ".pdf", ".tiff"]: + image_paths.extend(base_path.glob(f"{pattern}{ext}")) + else: + image_paths = [base_path] + else: + image_paths = [Path(p) for p in input_path] + + results = {} + errors = {} + + # Process images in parallel + with concurrent.futures.ThreadPoolExecutor( + max_workers=self.max_workers + ) as executor: + future_to_path = { + executor.submit( + self.process_image, + str(path), + format_type, + preprocess, + custom_prompt, + language, + ): path + for path in image_paths + } + + for future in concurrent.futures.as_completed(future_to_path): + path = future_to_path[future] + try: + results[str(path)] = future.result() + except Exception as e: + errors[str(path)] = str(e) + # pbar.update(1) + + return { + "results": results, + "errors": errors, + "statistics": { + "total": len(image_paths), + "successful": len(results), + "failed": len(errors), + }, + } diff --git a/libs/requirements-base.txt b/libs/requirements-base.txt index 25ee784..4e2efc7 100644 --- a/libs/requirements-base.txt +++ b/libs/requirements-base.txt @@ -1,13 +1,13 @@ # Core framework dependencies (Required by all services) -fastapi>=0.118.0 +fastapi>=0.119.0 uvicorn[standard]>=0.37.0 -pydantic>=2.11.9 +pydantic>=2.12.0 pydantic-settings>=2.11.0 # Database drivers (lightweight) -sqlalchemy>=2.0.43 +sqlalchemy>=2.0.44 asyncpg>=0.30.0 -psycopg2-binary>=2.9.10 +psycopg2-binary>=2.9.11 neo4j>=6.0.2 redis[hiredis]>=6.4.0 diff --git a/libs/requirements-pdf.txt b/libs/requirements-pdf.txt index 120bc78..ed50ce9 100644 --- a/libs/requirements-pdf.txt +++ b/libs/requirements-pdf.txt @@ -3,3 +3,4 @@ pdfrw>=0.4 reportlab>=4.4.4 PyPDF2>=3.0.1 pdfplumber>=0.11.7 +opencv-python diff --git a/libs/storage/client.py b/libs/storage/client.py index 251224d..a2659b9 100644 --- a/libs/storage/client.py +++ b/libs/storage/client.py @@ -79,7 +79,7 @@ class StorageClient: """Download object from bucket""" try: response = self.client.get_object(bucket_name, object_name) - data = response.read() + data: bytes = response.read() response.close() response.release_conn() @@ -89,7 +89,7 @@ class StorageClient: object=object_name, size=len(data), ) - return data # type: ignore + return data except S3Error as e: logger.error( diff --git a/mypy.ini b/mypy.ini index e26a471..86f082c 100644 --- a/mypy.ini +++ b/mypy.ini @@ -18,3 +18,7 @@ disallow_untyped_defs = False [mypy-minio.*] ignore_missing_imports = True + +[mypy-pytesseract.*] +follow_untyped_imports = True +ignore_missing_imports = True diff --git a/pyproject.toml b/pyproject.toml index c37e636..6267078 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,11 +54,20 @@ dependencies = [ "pytesseract>=0.3.10", "Pillow>=10.1.0", "playwright>=1.40.0", - "pyshaql>=0.25.0", + "pyshacl>=0.25.0", "rdflib>=7.0.0", "spacy>=3.7.0", "presidio-analyzer>=2.2.0", "presidio-anonymizer>=2.2.0", + "jsonschema>=4.0.0", + "boto3>=1.0.0", + "aiokafka>=0.8.0", + "hvac>=1.0.0", + "nats-py>=2.0.0", + "pydantic-settings>=2.0.0", + "opentelemetry-exporter-otlp>=1.0.0", + "opentelemetry-instrumentation-psycopg2>=0.42b0", + "opentelemetry-instrumentation-redis>=0.42b0", ] [project.optional-dependencies] diff --git a/requirements.txt b/requirements.txt index aa887bc..1edb0ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,6 +56,10 @@ numpy>=2.3.3 # PDF processing pdfrw>=0.4 reportlab>=4.4.4 +PyPDF2>=3.0.1 +pdf2image>=1.17.0 +pytesseract>=0.3.10 +Pillow>=10.3.0 # Date and time utilities python-dateutil>=2.9.0 @@ -94,3 +98,4 @@ black>=25.9.0 isort>=6.0.1 bandit>=1.8.6 safety>=3.6.2 +opencv-python diff --git a/scripts/deploy-to-production.sh b/scripts/deploy-to-production.sh index 8541ad5..de6026d 100644 --- a/scripts/deploy-to-production.sh +++ b/scripts/deploy-to-production.sh @@ -7,9 +7,9 @@ set -e # Configuration REMOTE_HOST="deploy@141.136.35.199" -REMOTE_PATH="/opt/compose/ai-tax-agent" -LOCAL_COMPOSE_PATH="infra/compose/production" -ENV_FILE="infra/compose/.env.production" +REMOTE_PATH="/opt/ai-tax-agent" +LOCAL_COMPOSE_PATH="infra/base" +ENV_FILE="infra/environments/production/.env" # Colors for output RED='\033[0;31m' @@ -66,13 +66,15 @@ backup_remote() { ssh $REMOTE_HOST << 'EOF' set -e mkdir -p ~/backups - cd /opt/compose + cd /opt - # Backup compose directory (exclude large cert files) - tar -czf ~/backups/backup-$(date +%Y%m%d-%H%M%S).tar.gz \ - --exclude='./traefik/certs/godaddy-acme.json' \ - --exclude='./*/node_modules' \ - . + # Backup application directory (exclude large cert files) + if [ -d ai-tax-agent ]; then + tar -czf ~/backups/backup-$(date +%Y%m%d-%H%M%S).tar.gz \ + --exclude='./traefik/certs/godaddy-acme.json' \ + --exclude='./*/node_modules' \ + ai-tax-agent + fi # Document current state docker ps > ~/backups/current-services-$(date +%Y%m%d-%H%M%S).txt @@ -100,6 +102,9 @@ prepare_remote() { mkdir -p $REMOTE_PATH/grafana/provisioning mkdir -p $REMOTE_PATH/grafana/dashboards mkdir -p $REMOTE_PATH/loki + mkdir -p $REMOTE_PATH/promtail + mkdir -p $REMOTE_PATH/traefik/config + mkdir -p $REMOTE_PATH/authentik echo "Directory structure created" ls -la $REMOTE_PATH @@ -110,7 +115,7 @@ EOF # Copy files to remote server copy_files() { - log_info "Copying compose files to remote server..." + log_info "Copying base compose files and configs to remote server..." # Copy compose files scp $LOCAL_COMPOSE_PATH/infrastructure.yaml $REMOTE_HOST:$REMOTE_PATH/ @@ -121,10 +126,13 @@ copy_files() { scp $ENV_FILE $REMOTE_HOST:$REMOTE_PATH/.env # Copy configuration files - scp -r infra/compose/prometheus/* $REMOTE_HOST:$REMOTE_PATH/prometheus/ - scp -r infra/compose/grafana/provisioning/* $REMOTE_HOST:$REMOTE_PATH/grafana/provisioning/ - scp -r infra/compose/grafana/dashboards/* $REMOTE_HOST:$REMOTE_PATH/grafana/dashboards/ - scp -r infra/compose/loki/* $REMOTE_HOST:$REMOTE_PATH/loki/ + scp -r $LOCAL_COMPOSE_PATH/prometheus/* $REMOTE_HOST:$REMOTE_PATH/prometheus/ + scp -r $LOCAL_COMPOSE_PATH/grafana/provisioning/* $REMOTE_HOST:$REMOTE_PATH/grafana/provisioning/ + scp -r $LOCAL_COMPOSE_PATH/grafana/dashboards/* $REMOTE_HOST:$REMOTE_PATH/grafana/dashboards/ + scp -r $LOCAL_COMPOSE_PATH/loki/* $REMOTE_HOST:$REMOTE_PATH/loki/ + scp -r $LOCAL_COMPOSE_PATH/promtail/* $REMOTE_HOST:$REMOTE_PATH/promtail/ 2>/dev/null || true + scp -r $LOCAL_COMPOSE_PATH/traefik/config/* $REMOTE_HOST:$REMOTE_PATH/traefik/config/ 2>/dev/null || true + scp -r $LOCAL_COMPOSE_PATH/authentik/* $REMOTE_HOST:$REMOTE_PATH/authentik/ 2>/dev/null || true log_success "Files copied to remote server" } diff --git a/tests/e2e/test_happy_path.py b/tests/e2e/test_happy_path.py index 4c2b3e5..76ccf1d 100644 --- a/tests/e2e/test_happy_path.py +++ b/tests/e2e/test_happy_path.py @@ -1,555 +1,4 @@ -# ROLE +import pytest -You are a **Senior Platform Engineer + Backend Lead** generating **production code** and **ops assets** for a microservice suite that powers an accounting Knowledge Graph + Vector RAG platform. Authentication/authorization are centralized at the **edge via Traefik + Authentik** (ForwardAuth). **Services are trust-bound** to Traefik and consume user/role claims via forwarded headers/JWT. - -# MISSION - -Produce fully working code for **all application services** (FastAPI + Python 3.12) with: - -- Solid domain models, Pydantic v2 schemas, type hints, strict mypy, ruff lint. -- Opentelemetry tracing, Prometheus metrics, structured logging. -- Vault-backed secrets, MinIO S3 client, Qdrant client, Neo4j driver, Postgres (SQLAlchemy), Redis. -- Eventing (Kafka or SQS/SNS behind an interface). -- Deterministic data contracts, end-to-end tests, Dockerfiles, Compose, CI for Gitea. -- Traefik labels + Authentik Outpost integration for every exposed route. -- Zero PII in vectors (Qdrant), evidence-based lineage in KG, and bitemporal writes. - -# GLOBAL CONSTRAINTS (APPLY TO ALL SERVICES) - -- **Language & Runtime:** Python **3.12**. -- **Frameworks:** FastAPI, Pydantic v2, SQLAlchemy 2, httpx, aiokafka or boto3 (pluggable), redis-py, opentelemetry-instrumentation-fastapi, prometheus-fastapi-instrumentator. -- **Config:** `pydantic-settings` with `.env` overlay. Provide `Settings` class per service. -- **Secrets:** HashiCorp **Vault** (AppRole/JWT). Use Vault Transit to **envelope-encrypt** sensitive fields before persistence (helpers provided in `lib/security.py`). -- **Auth:** No OIDC in services. Add `TrustedProxyMiddleware`: - - - Reject if request not from internal network (configurable CIDR). - - Require headers set by Traefik+Authentik (`X-Authenticated-User`, `X-Authenticated-Email`, `X-Authenticated-Groups`, `Authorization: Bearer …`). - - Parse groups → `roles` list on `request.state`. - -- **Observability:** - - - OpenTelemetry (traceparent propagation), span attrs (service, route, user, tenant). - - Prometheus metrics endpoint `/metrics` protected by internal network check. - - Structured JSON logs (timestamp, level, svc, trace_id, msg) via `structlog`. - -- **Errors:** Global exception handler → RFC7807 Problem+JSON (`type`, `title`, `status`, `detail`, `instance`, `trace_id`). -- **Testing:** `pytest`, `pytest-asyncio`, `hypothesis` (property tests for calculators), `coverage ≥ 90%` per service. -- **Static:** `ruff`, `mypy --strict`, `bandit`, `safety`, `licensecheck`. -- **Perf:** Each service exposes `/healthz`, `/readyz`, `/livez`; cold start < 500ms; p95 endpoint < 250ms (local). -- **Containers:** Distroless or slim images; non-root user; read-only FS; `/tmp` mounted for OCR where needed. -- **Docs:** OpenAPI JSON + ReDoc; MkDocs site with service READMEs. - -# SHARED LIBS (GENERATE ONCE, REUSE) - -Create `libs/` used by all services: - -- `libs/config.py` – base `Settings`, env parsing, Vault client factory, MinIO client factory, Qdrant client factory, Neo4j driver factory, Redis factory, Kafka/SQS client factory. -- `libs/security.py` – Vault Transit helpers (`encrypt_field`, `decrypt_field`), header parsing, internal-CIDR validator. -- `libs/observability.py` – otel init, prometheus instrumentor, logging config. -- `libs/events.py` – abstract `EventBus` with `publish(topic, payload: dict)`, `subscribe(topic, handler)`. Two impls: Kafka (`aiokafka`) and SQS/SNS (`boto3`). -- `libs/schemas.py` – **canonical Pydantic models** shared across services (Document, Evidence, IncomeItem, etc.) mirroring the ontology schemas. Include JSONSchema exports. -- `libs/storage.py` – S3/MinIO helpers (bucket ensure, put/get, presigned). -- `libs/neo.py` – Neo4j session helpers, Cypher runner with retry, SHACL validator invoker (pySHACL on exported RDF). -- `libs/rag.py` – Qdrant collections CRUD, hybrid search (dense+sparse), rerank wrapper, de-identification utilities (regex + NER; hash placeholders). -- `libs/forms.py` – PDF AcroForm fill via `pdfrw` with overlay fallback via `reportlab`. -- `libs/calibration.py` – `calibrated_confidence(raw_score, method="temperature_scaling", params=...)`. - -# EVENT TOPICS (STANDARDIZE) - -- `doc.ingested`, `doc.ocr_ready`, `doc.extracted`, `kg.upserted`, `rag.indexed`, `calc.schedule_ready`, `form.filled`, `hmrc.submitted`, `review.requested`, `review.completed`, `firm.sync.completed` - -Each payload MUST include: `event_id (ulid)`, `occurred_at (iso)`, `actor`, `tenant_id`, `trace_id`, `schema_version`, and a `data` object (service-specific). - -# TRUST HEADERS FROM TRAEFIK + AUTHENTIK (USE EXACT KEYS) - -- `X-Authenticated-User` (string) -- `X-Authenticated-Email` (string) -- `X-Authenticated-Groups` (comma-separated) -- `Authorization` (`Bearer ` from Authentik) - Reject any request missing these (except `/healthz|/readyz|/livez|/metrics` from internal CIDR). - ---- - -## SERVICES TO IMPLEMENT (CODE FOR EACH) - -### 1) `svc-ingestion` - -**Purpose:** Accept uploads or URLs, checksum, store to MinIO, emit `doc.ingested`. - -**Endpoints:** - -- `POST /v1/ingest/upload` (multipart file, metadata: `tenant_id`, `kind`, `source`) → `{doc_id, s3_url, checksum}` -- `POST /v1/ingest/url` (json: `{url, kind, tenant_id}`) → downloads to MinIO -- `GET /v1/docs/{doc_id}` → metadata - -**Logic:** - -- Compute SHA256, dedupe by checksum; MinIO path `tenants/{tenant_id}/raw/{doc_id}.pdf`. -- Store metadata in Postgres table `ingest_documents` (alembic migrations). -- Publish `doc.ingested` with `{doc_id, bucket, key, pages?, mime}`. - -**Env:** `S3_BUCKET_RAW`, `MINIO_*`, `DB_URL`. - -**Traefik labels:** route `/ingest/*`. - ---- - -### 2) `svc-rpa` - -**Purpose:** Scheduled RPA pulls from firm/client portals via Playwright. - -**Tasks:** - -- Playwright login flows (credentials from Vault), 2FA via Authentik OAuth device or OTP secret in Vault. -- Download statements/invoices; hand off to `svc-ingestion` via internal POST. -- Prefect flows: `pull_portal_X()`, `pull_portal_Y()` with schedules. - -**Endpoints:** - -- `POST /v1/rpa/run/{connector}` (manual trigger) -- `GET /v1/rpa/status/{run_id}` - -**Env:** `VAULT_ADDR`, `VAULT_ROLE_ID`, `VAULT_SECRET_ID`. - ---- - -### 3) `svc-ocr` - -**Purpose:** OCR & layout extraction. - -**Pipeline:** - -- Pull object from MinIO, detect rotation/de-skew (`opencv-python`), split pages (`pymupdf`), OCR (`pytesseract`) or bypass if text layer present (`pdfplumber`). -- Output per-page text + **bbox** for lines/words. -- Write JSON to MinIO `tenants/{tenant_id}/ocr/{doc_id}.json` and emit `doc.ocr_ready`. - -**Endpoints:** - -- `POST /v1/ocr/{doc_id}` (idempotent trigger) -- `GET /v1/ocr/{doc_id}` (fetch OCR JSON) - -**Env:** `TESSERACT_LANGS`, `S3_BUCKET_EVIDENCE`. - ---- - -### 4) `svc-extract` - -**Purpose:** Classify docs and extract KV + tables into **schema-constrained JSON** (with bbox/page). - -**Endpoints:** - -- `POST /v1/extract/{doc_id}` body: `{strategy: "llm|rules|hybrid"}` -- `GET /v1/extract/{doc_id}` → structured JSON - -**Implementation:** - -- Use prompt files in `prompts/`: `doc_classify.txt`, `kv_extract.txt`, `table_extract.txt`. -- **Validator loop**: run LLM → validate JSONSchema → retry with error messages up to N times. -- Return Pydantic models from `libs/schemas.py`. -- Emit `doc.extracted`. - -**Env:** `LLM_ENGINE`, `TEMPERATURE`, `MAX_TOKENS`. - ---- - -### 5) `svc-normalize-map` - -**Purpose:** Normalize & map extracted data to KG. - -**Logic:** - -- Currency normalization (ECB or static fx table), dates, UK tax year/basis period inference. -- Entity resolution (blocking + fuzzy). -- Generate nodes/edges (+ `Evidence` with doc_id/page/bbox/text_hash). -- Use `libs/neo.py` to write with **bitemporal** fields; run **SHACL** validator; on violation, queue `review.requested`. -- Emit `kg.upserted`. - -**Endpoints:** - -- `POST /v1/map/{doc_id}` -- `GET /v1/map/{doc_id}/preview` (diff view, to be used by UI) - -**Env:** `NEO4J_*`. - ---- - -### 6) `svc-kg` - -**Purpose:** Graph façade + RDF/SHACL utility. - -**Endpoints:** - -- `GET /v1/kg/nodes/{label}/{id}` -- `POST /v1/kg/cypher` (admin-gated inline query; must check `admin` role) -- `POST /v1/kg/export/rdf` (returns RDF for SHACL) -- `POST /v1/kg/validate` (run pySHACL against `schemas/shapes.ttl`) -- `GET /v1/kg/lineage/{node_id}` (traverse `DERIVED_FROM` → Evidence) - -**Env:** `NEO4J_*`. - ---- - -### 7) `svc-rag-indexer` - -**Purpose:** Build Qdrant indices (firm knowledge, legislation, best practices, glossary). - -**Workflow:** - -- Load sources (filesystem, URLs, Firm DMS via `svc-firm-connectors`). -- **De-identify PII** (regex + NER), replace with placeholders; store mapping only in Postgres. -- Chunk (layout-aware) per `retrieval/chunking.yaml`. -- Compute **dense** embeddings (e.g., `bge-small-en-v1.5`) and **sparse** (Qdrant sparse). -- Upsert to Qdrant with payload `{jurisdiction, tax_years[], topic_tags[], version, pii_free: true, doc_id/section_id/url}`. -- Emit `rag.indexed`. - -**Endpoints:** - -- `POST /v1/index/run` -- `GET /v1/index/status/{run_id}` - -**Env:** `QDRANT_URL`, `RAG_EMBEDDING_MODEL`, `RAG_RERANKER_MODEL`. - ---- - -### 8) `svc-rag-retriever` - -**Purpose:** Hybrid search + KG fusion with rerank and calibrated confidence. - -**Endpoint:** - -- `POST /v1/rag/search` `{query, tax_year?, jurisdiction?, k?}` → - - ``` - { - "chunks": [...], - "citations": [{doc_id|url, section_id?, page?, bbox?}], - "kg_hints": [{rule_id, formula_id, node_ids[]}], - "calibrated_confidence": 0.0-1.0 - } - ``` - -**Implementation:** - -- Hybrid score: `alpha * dense + beta * sparse`; rerank top-K via cross-encoder; **KG fusion** (boost chunks citing Rules/Calculations relevant to schedule). -- Use `libs/calibration.py` to expose calibrated confidence. - ---- - -### 9) `svc-reason` - -**Purpose:** Deterministic calculators + materializers (UK SA). - -**Endpoints:** - -- `POST /v1/reason/compute_schedule` `{tax_year, taxpayer_id, schedule_id}` -- `GET /v1/reason/explain/{schedule_id}` → rationale & lineage paths - -**Implementation:** - -- Pure functions for: employment, self-employment, property (FHL, 20% interest credit), dividends/interest, allowances, NIC (Class 2/4), HICBC, student loans (Plans 1/2/4/5, PGL). -- **Deterministic order** as defined; rounding per `FormBox.rounding_rule`. -- Use Cypher from `kg/reasoning/schedule_queries.cypher` to materialize box values; attach `DERIVED_FROM` evidence. - ---- - -### 10) `svc-forms` - -**Purpose:** Fill PDFs and assemble evidence bundles. - -**Endpoints:** - -- `POST /v1/forms/fill` `{tax_year, taxpayer_id, form_id}` → returns PDF (binary) -- `POST /v1/forms/evidence_pack` `{scope}` → ZIP + manifest + signed hashes (sha256) - -**Implementation:** - -- `pdfrw` for AcroForm; overlay with ReportLab if needed. -- Manifest includes `doc_id/page/bbox/text_hash` for every numeric field. - ---- - -### 11) `svc-hmrc` - -**Purpose:** HMRC submitter (stub|sandbox|live). - -**Endpoints:** - -- `POST /v1/hmrc/submit` `{tax_year, taxpayer_id, dry_run}` → `{status, submission_id?, errors[]}` -- `GET /v1/hmrc/submissions/{id}` - -**Implementation:** - -- Rate limits, retries/backoff, signed audit log; environment toggle. - ---- - -### 12) `svc-firm-connectors` - -**Purpose:** Read-only connectors to Firm Databases (Practice Mgmt, DMS). - -**Endpoints:** - -- `POST /v1/firm/sync` `{since?}` → `{objects_synced, errors[]}` -- `GET /v1/firm/objects` (paged) - -**Implementation:** - -- Data contracts in `config/firm_contracts/`; mappers → Secure Client Data Store (Postgres) with lineage columns (`source`, `source_id`, `synced_at`). - ---- - -### 13) `ui-review` (outline only) - -- Next.js (SSO handled by Traefik+Authentik), shows extracted fields + evidence snippets; POST overrides to `svc-extract`/`svc-normalize-map`. - ---- - -## DATA CONTRACTS (ESSENTIAL EXAMPLES) - -**Event: `doc.ingested`** - -```json -{ - "event_id": "01J...ULID", - "occurred_at": "2025-09-13T08:00:00Z", - "actor": "svc-ingestion", - "tenant_id": "t_123", - "trace_id": "abc-123", - "schema_version": "1.0", - "data": { - "doc_id": "d_abc", - "bucket": "raw", - "key": "tenants/t_123/raw/d_abc.pdf", - "checksum": "sha256:...", - "kind": "bank_statement", - "mime": "application/pdf", - "pages": 12 - } -} -``` - -**RAG search response shape** - -```json -{ - "chunks": [ - { - "id": "c1", - "text": "...", - "score": 0.78, - "payload": { - "jurisdiction": "UK", - "tax_years": ["2024-25"], - "topic_tags": ["FHL"], - "pii_free": true - } - } - ], - "citations": [ - { "doc_id": "leg-ITA2007", "section_id": "s272A", "url": "https://..." } - ], - "kg_hints": [ - { - "rule_id": "UK.FHL.Qual", - "formula_id": "FHL_Test_v1", - "node_ids": ["n123", "n456"] - } - ], - "calibrated_confidence": 0.81 -} -``` - ---- - -## PERSISTENCE SCHEMAS (POSTGRES; ALEMBIC) - -- `ingest_documents(id pk, tenant_id, doc_id, kind, checksum, bucket, key, mime, pages, created_at)` -- `firm_objects(id pk, tenant_id, source, source_id, type, payload jsonb, synced_at)` -- Qdrant PII mapping table (if absolutely needed): `pii_links(id pk, placeholder_hash, client_id, created_at)` — **encrypt with Vault Transit**; do NOT store raw values. - ---- - -## TRAEFIK + AUTHENTIK (COMPOSE LABELS PER SERVICE) - -For every service container in `infra/compose/docker-compose.local.yml`, add labels: - -``` -- "traefik.enable=true" -- "traefik.http.routers.svc-extract.rule=Host(`api.local`) && PathPrefix(`/extract`)" -- "traefik.http.routers.svc-extract.entrypoints=websecure" -- "traefik.http.routers.svc-extract.tls=true" -- "traefik.http.routers.svc-extract.middlewares=authentik-forwardauth,rate-limit" -- "traefik.http.services.svc-extract.loadbalancer.server.port=8000" -``` - -Use the shared dynamic file `traefik-dynamic.yml` with `authentik-forwardauth` and `rate-limit` middlewares. - ---- - -## OUTPUT FORMAT (STRICT) - -Implement a **multi-file codebase** as fenced blocks, EXACTLY in this order: - -```txt -# FILE: libs/config.py -# factories for Vault/MinIO/Qdrant/Neo4j/Redis/EventBus, Settings base -... -``` - -```txt -# FILE: libs/security.py -# Vault Transit helpers, header parsing, internal CIDR checks, middleware -... -``` - -```txt -# FILE: libs/observability.py -# otel init, prometheus, structlog -... -``` - -```txt -# FILE: libs/events.py -# EventBus abstraction with Kafka and SQS/SNS impls -... -``` - -```txt -# FILE: libs/schemas.py -# Shared Pydantic models mirroring ontology entities -... -``` - -```txt -# FILE: apps/svc-ingestion/main.py -# FastAPI app, endpoints, MinIO write, Postgres, publish doc.ingested -... -``` - -```txt -# FILE: apps/svc-rpa/main.py -# Playwright flows, Prefect tasks, triggers -... -``` - -```txt -# FILE: apps/svc-ocr/main.py -# OCR pipeline, endpoints -... -``` - -```txt -# FILE: apps/svc-extract/main.py -# Classifier + extractors with validator loop -... -``` - -```txt -# FILE: apps/svc-normalize-map/main.py -# normalization, entity resolution, KG mapping, SHACL validation call -... -``` - -```txt -# FILE: apps/svc-kg/main.py -# KG façade, RDF export, SHACL validate, lineage traversal -... -``` - -```txt -# FILE: apps/svc-rag-indexer/main.py -# chunk/de-id/embed/upsert to Qdrant -... -``` - -```txt -# FILE: apps/svc-rag-retriever/main.py -# hybrid retrieval + rerank + KG fusion -... -``` - -```txt -# FILE: apps/svc-reason/main.py -# deterministic calculators, schedule compute/explain -... -``` - -```txt -# FILE: apps/svc-forms/main.py -# PDF fill + evidence pack -... -``` - -```txt -# FILE: apps/svc-hmrc/main.py -# submit stub|sandbox|live with audit + retries -... -``` - -```txt -# FILE: apps/svc-firm-connectors/main.py -# connectors to practice mgmt & DMS, sync to Postgres -... -``` - -```txt -# FILE: infra/compose/docker-compose.local.yml -# Traefik, Authentik, Vault, MinIO, Qdrant, Neo4j, Postgres, Redis, Prom+Grafana, Loki, Unleash, all services -... -``` - -```txt -# FILE: infra/compose/traefik.yml -# static Traefik config -... -``` - -```txt -# FILE: infra/compose/traefik-dynamic.yml -# forwardAuth middleware + routers/services -... -``` - -```txt -# FILE: .gitea/workflows/ci.yml -# lint->test->build->scan->push->deploy -... -``` - -```txt -# FILE: Makefile -# bootstrap, run, test, lint, build, deploy, format, seed -... -``` - -```txt -# FILE: tests/e2e/test_happy_path.py -# end-to-end: ingest -> ocr -> extract -> map -> compute -> fill -> (stub) submit -... -``` - -```txt -# FILE: tests/unit/test_calculators.py -# boundary tests for UK SA logic (NIC, HICBC, PA taper, FHL) -... -``` - -```txt -# FILE: README.md -# how to run locally with docker-compose, Authentik setup, Traefik certs -... -``` - -## DEFINITION OF DONE - -- `docker compose up` brings the full stack up; SSO via Authentik; routes secured via Traefik ForwardAuth. -- Running `pytest` yields ≥ 90% coverage; `make e2e` passes the ingest→…→submit stub flow. -- All services expose `/healthz|/readyz|/livez|/metrics`; OpenAPI at `/docs`. -- No PII stored in Qdrant; vectors carry `pii_free=true`. -- KG writes are SHACL-validated; violations produce `review.requested` events. -- Evidence lineage is present for every numeric box value. -- Gitea pipeline passes: lint, test, build, scan, push, deploy. - -# START - -Generate the full codebase and configs in the **exact file blocks and order** specified above. +def test_happy_path(): + pass \ No newline at end of file