Some checks failed
CI/CD Pipeline / Code Quality & Linting (push) Has been cancelled
CI/CD Pipeline / Policy Validation (push) Has been cancelled
CI/CD Pipeline / Test Suite (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-firm-connectors) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-forms) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-hmrc) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ingestion) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-normalize-map) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-ocr) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-indexer) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-reason) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (svc-rpa) (push) Has been cancelled
CI/CD Pipeline / Build Docker Images (ui-review) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-coverage) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-extract) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-kg) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (svc-rag-retriever) (push) Has been cancelled
CI/CD Pipeline / Security Scanning (ui-review) (push) Has been cancelled
CI/CD Pipeline / Generate SBOM (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / Notifications (push) Has been cancelled
88 lines
2.7 KiB
YAML
88 lines
2.7 KiB
YAML
groups:
|
|
- name: infrastructure
|
|
rules:
|
|
- alert: InstanceDown
|
|
expr: up == 0
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Instance {{ $labels.instance }} down"
|
|
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
|
|
|
|
- alert: HighMemoryUsage
|
|
expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes > 0.9
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High memory usage on {{ $labels.instance }}"
|
|
description: "Memory usage is above 90% on {{ $labels.instance }}"
|
|
|
|
- alert: HighCPUUsage
|
|
expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "High CPU usage on {{ $labels.instance }}"
|
|
description: "CPU usage is above 80% on {{ $labels.instance }}"
|
|
|
|
- name: application
|
|
rules:
|
|
- alert: HighErrorRate
|
|
expr: rate(http_requests_total{status=~"5.."}[5m]) / rate(http_requests_total[5m]) > 0.1
|
|
for: 5m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "High error rate on {{ $labels.job }}"
|
|
description: "Error rate is above 10% on {{ $labels.job }}"
|
|
|
|
- alert: SlowResponseTime
|
|
expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Slow response time on {{ $labels.job }}"
|
|
description: "95th percentile response time is above 1 second on {{ $labels.job }}"
|
|
|
|
- name: database
|
|
rules:
|
|
- alert: PostgreSQLDown
|
|
expr: up{job="postgres"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "PostgreSQL is down"
|
|
description: "PostgreSQL database is not responding"
|
|
|
|
- alert: Neo4jDown
|
|
expr: up{job="neo4j"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Neo4j is down"
|
|
description: "Neo4j graph database is not responding"
|
|
|
|
- alert: QdrantDown
|
|
expr: up{job="qdrant"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Qdrant is down"
|
|
description: "Qdrant vector database is not responding"
|
|
|
|
- alert: RedisDown
|
|
expr: up{job="redis"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
summary: "Redis is down"
|
|
description: "Redis cache is not responding"
|